--- /dev/null
+cmake_minimum_required(VERSION 3.7.2)
+
+project(pcre2)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_C_STANDARD 99)
+
+set(PCRE2_INCLUDE_DIR ${CMAKE_BINARY_DIR}/src/libpcre2)
+set(CYTHON_EXTRA_COMPILE_ARGS -DPCRE2_CODE_UNIT_WIDTH=8 -fPIC)
+
+# Set PCRE2 options.
+set(PCRE2_SUPPORT_JIT ON CACHE BOOL "" FORCE)
+set(PCRE2_NEVER_BACKSLASH_C ON CACHE BOOL "" FORCE)
+
+# Always make a release build.
+set(CMAKE_BUILD_TYPE Release)
+
+# Build PCRE2 library as both shared and static.
+set(BUILD_STATIC_LIBS ON)
+set(BUILD_SHARED_LIBS ON)
+add_subdirectory(src/libpcre2)
+
+# Build Cython code as shared.
+set(BUILD_STATIC_LIBS OFF)
+set(BUILD_SHARED_LIBS ON)
+add_subdirectory(src/pcre2)
+
+# Include PCRE2 header for Cython API.
+install(FILES ${PCRE2_INCLUDE_DIR}/pcre2.h DESTINATION src/pcre2)
--- /dev/null
+BSD 3-Clause License
+
+Copyright (c) 2022, grtetrault
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- /dev/null
+SHELL = /bin/bash
+
+init:
+ git submodule update --init
+ python3 -m venv ./.venv
+ ./.venv/bin/pip install -r ./requirements/build-requirements.txt
+ ./.venv/bin/pip install -r ./requirements/test-requirements.txt
+ ./.venv/bin/pip install .
+
+build:
+ ./.venv/bin/pip install . --force-reinstall
+
+clean:
+ rm -rf ./dist
+ rm -rf ./build
+ rm -rf ./_skbuild
+ find ./src/pcre2 -type f -name '*.c' -print0 | xargs -0 rm -vf
+ find ./src/pcre2 -type f -name '*.html' -print0 | xargs -0 rm -vf
+ find . -type f -name '*.pyc' | xargs rm -r
+ find . -type d -name '*.egg-info' | xargs rm -r
+ find . -type d -name '*.ipynb_checkpoints' | xargs rm -r
+
+purge:
+ rm -rf ./.venv
+
+benchmark:
+ ./.venv/bin/python ./benchmarks/run_regex_redux.py
--- /dev/null
+Metadata-Version: 2.1
+Name: pcre2
+Version: 0.4.0
+Summary: Python bindings for the PCRE2 regular expression library
+Home-page: https://github.com/grtetrault/pcre2.py
+Author: Garrett Tetrault
+License: BSD 3-Clause License
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: C
+Classifier: Programming Language :: Cython
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: Microsoft :: Windows
+Description-Content-Type: text/markdown
+License-File: LICENSE
+
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, options=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit_size
+980
+>>> patn.name_dict()
+{1: 'head', 2: 'tail'}
+>>> patn.options
+524296
+>>> # Deeper inspection into options is available.
+>>> pcre2.CompileOption.decompose(patn.options)
+[<CompileOption.CASELESS: 0x8>, <CompileOption.UTF: 0x80000>]
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching return a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match.substring()
+'foo bar'
+>>> match.start(), match.end()
+(8, 17)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.substitute(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.substitute(repl, subj, suball=False)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo buzz bazz'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.scan(subj):
+... print(match.substring('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script | Number of runs | Total time | Real time | User time | System time |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| `baseline.py` | 10 | 3.020 | 0.302 | 0.020 | 0.086 |
+| `vanilla.py` | 10 | 51.380 | 5.138 | 11.408 | 0.529 |
+| `hand_optimized.py` | 10 | 13.190 | 1.319 | 2.846 | 0.344 |
+| `pcre2_module.py` | 10 | 13.670 | 1.367 | 2.269 | 0.532 |
+
+Script descriptions are as follows,
+
+| Script | Description |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py` | Reads input file and outputs stored expected output |
+| `vanilla.py` | Pure Python version |
+| `hand_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+| `pcre2_module.py` | Implementation using Python bindings written here |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
--- /dev/null
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, options=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit_size
+980
+>>> patn.name_dict()
+{1: 'head', 2: 'tail'}
+>>> patn.options
+524296
+>>> # Deeper inspection into options is available.
+>>> pcre2.CompileOption.decompose(patn.options)
+[<CompileOption.CASELESS: 0x8>, <CompileOption.UTF: 0x80000>]
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching return a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match.substring()
+'foo bar'
+>>> match.start(), match.end()
+(8, 17)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.substitute(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.substitute(repl, subj, suball=False)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo buzz bazz'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.scan(subj):
+... print(match.substring('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script | Number of runs | Total time | Real time | User time | System time |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| `baseline.py` | 10 | 3.020 | 0.302 | 0.020 | 0.086 |
+| `vanilla.py` | 10 | 51.380 | 5.138 | 11.408 | 0.529 |
+| `hand_optimized.py` | 10 | 13.190 | 1.319 | 2.846 | 0.344 |
+| `pcre2_module.py` | 10 | 13.670 | 1.367 | 2.269 | 0.532 |
+
+Script descriptions are as follows,
+
+| Script | Description |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py` | Reads input file and outputs stored expected output |
+| `vanilla.py` | Pure Python version |
+| `hand_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+| `pcre2_module.py` | Implementation using Python bindings written here |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
--- /dev/null
+[build-system]
+requires = [
+ "setuptools>=42",
+ "scikit-build",
+ "Cython",
+ "cmake"
+]
+build-backend = "setuptools.build_meta"
--- /dev/null
+requests
+build
+wheel
+scikit-build
+cmake
+Cython
\ No newline at end of file
--- /dev/null
+twine
+pytest
+gitpython
\ No newline at end of file
--- /dev/null
+[egg_info]
+tag_build =
+tag_date = 0
+
--- /dev/null
+# -*- coding:utf-8 -*-
+
+import os
+import skbuild
+import setuptools
+
+
+def get_long_desciption():
+ cwd = os.path.abspath(os.path.dirname(__file__))
+ filename = os.path.join(cwd, "README.md")
+ with open(filename) as f:
+ long_description = f.read()
+
+ return long_description
+
+
+skbuild.setup(
+ name = "pcre2",
+ version = "0.4.0",
+ description = "Python bindings for the PCRE2 regular expression library",
+ long_description = get_long_desciption(),
+ long_description_content_type = "text/markdown",
+ license = "BSD 3-Clause License",
+ author = "Garrett Tetrault",
+ url = "https://github.com/grtetrault/pcre2.py",
+ classifiers = [
+ "Development Status :: 3 - Alpha",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: BSD License",
+ "Programming Language :: C",
+ "Programming Language :: Cython",
+ "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Operating System :: MacOS :: MacOS X",
+ "Operating System :: POSIX :: Linux",
+ "Operating System :: Microsoft :: Windows"
+ ],
+ include_package_data=True,
+ packages = setuptools.find_packages("src"),
+ package_dir = {"": "src"},
+ cmake_languages = "C",
+)
--- /dev/null
+Metadata-Version: 2.1
+Name: pcre2
+Version: 0.4.0
+Summary: Python bindings for the PCRE2 regular expression library
+Home-page: https://github.com/grtetrault/pcre2.py
+Author: Garrett Tetrault
+License: BSD 3-Clause License
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: C
+Classifier: Programming Language :: Cython
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: Microsoft :: Windows
+Description-Content-Type: text/markdown
+License-File: LICENSE
+
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, options=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit_size
+980
+>>> patn.name_dict()
+{1: 'head', 2: 'tail'}
+>>> patn.options
+524296
+>>> # Deeper inspection into options is available.
+>>> pcre2.CompileOption.decompose(patn.options)
+[<CompileOption.CASELESS: 0x8>, <CompileOption.UTF: 0x80000>]
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching return a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match.substring()
+'foo bar'
+>>> match.start(), match.end()
+(8, 17)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.substitute(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.substitute(repl, subj, suball=False)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo buzz bazz'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.scan(subj):
+... print(match.substring('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script | Number of runs | Total time | Real time | User time | System time |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| `baseline.py` | 10 | 3.020 | 0.302 | 0.020 | 0.086 |
+| `vanilla.py` | 10 | 51.380 | 5.138 | 11.408 | 0.529 |
+| `hand_optimized.py` | 10 | 13.190 | 1.319 | 2.846 | 0.344 |
+| `pcre2_module.py` | 10 | 13.670 | 1.367 | 2.269 | 0.532 |
+
+Script descriptions are as follows,
+
+| Script | Description |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py` | Reads input file and outputs stored expected output |
+| `vanilla.py` | Pure Python version |
+| `hand_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+| `pcre2_module.py` | Implementation using Python bindings written here |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
--- /dev/null
+CMakeLists.txt
+LICENSE
+Makefile
+README.md
+pyproject.toml
+setup.py
+requirements/build-requirements.txt
+requirements/test-requirements.txt
+src/libpcre2/.bazelrc
+src/libpcre2/.git
+src/libpcre2/.gitignore
+src/libpcre2/132html
+src/libpcre2/AUTHORS
+src/libpcre2/BUILD.bazel
+src/libpcre2/CMakeLists.txt
+src/libpcre2/COPYING
+src/libpcre2/ChangeLog
+src/libpcre2/CheckMan
+src/libpcre2/CleanTxt
+src/libpcre2/Detrail
+src/libpcre2/HACKING
+src/libpcre2/LICENCE
+src/libpcre2/MODULE.bazel
+src/libpcre2/Makefile.am
+src/libpcre2/NEWS
+src/libpcre2/NON-AUTOTOOLS-BUILD
+src/libpcre2/PrepareRelease
+src/libpcre2/README
+src/libpcre2/README.md
+src/libpcre2/RunGrepTest
+src/libpcre2/RunGrepTest.bat
+src/libpcre2/RunTest
+src/libpcre2/RunTest.bat
+src/libpcre2/WORKSPACE.bazel
+src/libpcre2/autogen.sh
+src/libpcre2/build.zig
+src/libpcre2/config-cmake.h.in
+src/libpcre2/configure.ac
+src/libpcre2/index.md
+src/libpcre2/libpcre2-16.pc.in
+src/libpcre2/libpcre2-32.pc.in
+src/libpcre2/libpcre2-8.pc.in
+src/libpcre2/libpcre2-posix.pc.in
+src/libpcre2/pcre2-config.in
+src/libpcre2/pcre2_fuzzer.dict
+src/libpcre2/pcre2_fuzzer.options
+src/libpcre2/pcre2_fuzzer_16.dict
+src/libpcre2/pcre2_fuzzer_16.options
+src/libpcre2/pcre2_fuzzer_32.dict
+src/libpcre2/pcre2_fuzzer_32.options
+src/libpcre2/perltest.sh
+src/libpcre2/.github/workflows/build.yml
+src/libpcre2/.github/workflows/cifuzz.yml
+src/libpcre2/.github/workflows/codeql.yml
+src/libpcre2/.github/workflows/dev.yml
+src/libpcre2/.github/workflows/scorecards.yml
+src/libpcre2/cmake/COPYING-CMAKE-SCRIPTS
+src/libpcre2/cmake/FindEditline.cmake
+src/libpcre2/cmake/FindPackageHandleStandardArgs.cmake
+src/libpcre2/cmake/FindReadline.cmake
+src/libpcre2/cmake/pcre2-config-version.cmake.in
+src/libpcre2/cmake/pcre2-config.cmake.in
+src/libpcre2/doc/index.html.src
+src/libpcre2/doc/pcre2-config.1
+src/libpcre2/doc/pcre2-config.txt
+src/libpcre2/doc/pcre2.3
+src/libpcre2/doc/pcre2.txt
+src/libpcre2/doc/pcre2_callout_enumerate.3
+src/libpcre2/doc/pcre2_code_copy.3
+src/libpcre2/doc/pcre2_code_copy_with_tables.3
+src/libpcre2/doc/pcre2_code_free.3
+src/libpcre2/doc/pcre2_compile.3
+src/libpcre2/doc/pcre2_compile_context_copy.3
+src/libpcre2/doc/pcre2_compile_context_create.3
+src/libpcre2/doc/pcre2_compile_context_free.3
+src/libpcre2/doc/pcre2_config.3
+src/libpcre2/doc/pcre2_convert_context_copy.3
+src/libpcre2/doc/pcre2_convert_context_create.3
+src/libpcre2/doc/pcre2_convert_context_free.3
+src/libpcre2/doc/pcre2_converted_pattern_free.3
+src/libpcre2/doc/pcre2_dfa_match.3
+src/libpcre2/doc/pcre2_general_context_copy.3
+src/libpcre2/doc/pcre2_general_context_create.3
+src/libpcre2/doc/pcre2_general_context_free.3
+src/libpcre2/doc/pcre2_get_error_message.3
+src/libpcre2/doc/pcre2_get_mark.3
+src/libpcre2/doc/pcre2_get_match_data_heapframes_size.3
+src/libpcre2/doc/pcre2_get_match_data_size.3
+src/libpcre2/doc/pcre2_get_ovector_count.3
+src/libpcre2/doc/pcre2_get_ovector_pointer.3
+src/libpcre2/doc/pcre2_get_startchar.3
+src/libpcre2/doc/pcre2_jit_compile.3
+src/libpcre2/doc/pcre2_jit_free_unused_memory.3
+src/libpcre2/doc/pcre2_jit_match.3
+src/libpcre2/doc/pcre2_jit_stack_assign.3
+src/libpcre2/doc/pcre2_jit_stack_create.3
+src/libpcre2/doc/pcre2_jit_stack_free.3
+src/libpcre2/doc/pcre2_maketables.3
+src/libpcre2/doc/pcre2_maketables_free.3
+src/libpcre2/doc/pcre2_match.3
+src/libpcre2/doc/pcre2_match_context_copy.3
+src/libpcre2/doc/pcre2_match_context_create.3
+src/libpcre2/doc/pcre2_match_context_free.3
+src/libpcre2/doc/pcre2_match_data_create.3
+src/libpcre2/doc/pcre2_match_data_create_from_pattern.3
+src/libpcre2/doc/pcre2_match_data_free.3
+src/libpcre2/doc/pcre2_pattern_convert.3
+src/libpcre2/doc/pcre2_pattern_info.3
+src/libpcre2/doc/pcre2_serialize_decode.3
+src/libpcre2/doc/pcre2_serialize_encode.3
+src/libpcre2/doc/pcre2_serialize_free.3
+src/libpcre2/doc/pcre2_serialize_get_number_of_codes.3
+src/libpcre2/doc/pcre2_set_bsr.3
+src/libpcre2/doc/pcre2_set_callout.3
+src/libpcre2/doc/pcre2_set_character_tables.3
+src/libpcre2/doc/pcre2_set_compile_extra_options.3
+src/libpcre2/doc/pcre2_set_compile_recursion_guard.3
+src/libpcre2/doc/pcre2_set_depth_limit.3
+src/libpcre2/doc/pcre2_set_glob_escape.3
+src/libpcre2/doc/pcre2_set_glob_separator.3
+src/libpcre2/doc/pcre2_set_heap_limit.3
+src/libpcre2/doc/pcre2_set_match_limit.3
+src/libpcre2/doc/pcre2_set_max_pattern_length.3
+src/libpcre2/doc/pcre2_set_max_varlookbehind.3
+src/libpcre2/doc/pcre2_set_newline.3
+src/libpcre2/doc/pcre2_set_offset_limit.3
+src/libpcre2/doc/pcre2_set_parens_nest_limit.3
+src/libpcre2/doc/pcre2_set_recursion_limit.3
+src/libpcre2/doc/pcre2_set_recursion_memory_management.3
+src/libpcre2/doc/pcre2_set_substitute_callout.3
+src/libpcre2/doc/pcre2_substitute.3
+src/libpcre2/doc/pcre2_substring_copy_byname.3
+src/libpcre2/doc/pcre2_substring_copy_bynumber.3
+src/libpcre2/doc/pcre2_substring_free.3
+src/libpcre2/doc/pcre2_substring_get_byname.3
+src/libpcre2/doc/pcre2_substring_get_bynumber.3
+src/libpcre2/doc/pcre2_substring_length_byname.3
+src/libpcre2/doc/pcre2_substring_length_bynumber.3
+src/libpcre2/doc/pcre2_substring_list_free.3
+src/libpcre2/doc/pcre2_substring_list_get.3
+src/libpcre2/doc/pcre2_substring_nametable_scan.3
+src/libpcre2/doc/pcre2_substring_number_from_name.3
+src/libpcre2/doc/pcre2api.3
+src/libpcre2/doc/pcre2build.3
+src/libpcre2/doc/pcre2callout.3
+src/libpcre2/doc/pcre2compat.3
+src/libpcre2/doc/pcre2convert.3
+src/libpcre2/doc/pcre2demo.3
+src/libpcre2/doc/pcre2grep.1
+src/libpcre2/doc/pcre2grep.txt
+src/libpcre2/doc/pcre2jit.3
+src/libpcre2/doc/pcre2limits.3
+src/libpcre2/doc/pcre2matching.3
+src/libpcre2/doc/pcre2partial.3
+src/libpcre2/doc/pcre2pattern.3
+src/libpcre2/doc/pcre2perform.3
+src/libpcre2/doc/pcre2posix.3
+src/libpcre2/doc/pcre2sample.3
+src/libpcre2/doc/pcre2serialize.3
+src/libpcre2/doc/pcre2syntax.3
+src/libpcre2/doc/pcre2test.1
+src/libpcre2/doc/pcre2test.txt
+src/libpcre2/doc/pcre2unicode.3
+src/libpcre2/doc/html/NON-AUTOTOOLS-BUILD.txt
+src/libpcre2/doc/html/README.txt
+src/libpcre2/doc/html/index.html
+src/libpcre2/doc/html/pcre2-config.html
+src/libpcre2/doc/html/pcre2.html
+src/libpcre2/doc/html/pcre2_callout_enumerate.html
+src/libpcre2/doc/html/pcre2_code_copy.html
+src/libpcre2/doc/html/pcre2_code_copy_with_tables.html
+src/libpcre2/doc/html/pcre2_code_free.html
+src/libpcre2/doc/html/pcre2_compile.html
+src/libpcre2/doc/html/pcre2_compile_context_copy.html
+src/libpcre2/doc/html/pcre2_compile_context_create.html
+src/libpcre2/doc/html/pcre2_compile_context_free.html
+src/libpcre2/doc/html/pcre2_config.html
+src/libpcre2/doc/html/pcre2_convert_context_copy.html
+src/libpcre2/doc/html/pcre2_convert_context_create.html
+src/libpcre2/doc/html/pcre2_convert_context_free.html
+src/libpcre2/doc/html/pcre2_converted_pattern_free.html
+src/libpcre2/doc/html/pcre2_dfa_match.html
+src/libpcre2/doc/html/pcre2_general_context_copy.html
+src/libpcre2/doc/html/pcre2_general_context_create.html
+src/libpcre2/doc/html/pcre2_general_context_free.html
+src/libpcre2/doc/html/pcre2_get_error_message.html
+src/libpcre2/doc/html/pcre2_get_mark.html
+src/libpcre2/doc/html/pcre2_get_match_data_heapframes_size.html
+src/libpcre2/doc/html/pcre2_get_match_data_size.html
+src/libpcre2/doc/html/pcre2_get_ovector_count.html
+src/libpcre2/doc/html/pcre2_get_ovector_pointer.html
+src/libpcre2/doc/html/pcre2_get_startchar.html
+src/libpcre2/doc/html/pcre2_jit_compile.html
+src/libpcre2/doc/html/pcre2_jit_free_unused_memory.html
+src/libpcre2/doc/html/pcre2_jit_match.html
+src/libpcre2/doc/html/pcre2_jit_stack_assign.html
+src/libpcre2/doc/html/pcre2_jit_stack_create.html
+src/libpcre2/doc/html/pcre2_jit_stack_free.html
+src/libpcre2/doc/html/pcre2_maketables.html
+src/libpcre2/doc/html/pcre2_maketables_free.html
+src/libpcre2/doc/html/pcre2_match.html
+src/libpcre2/doc/html/pcre2_match_context_copy.html
+src/libpcre2/doc/html/pcre2_match_context_create.html
+src/libpcre2/doc/html/pcre2_match_context_free.html
+src/libpcre2/doc/html/pcre2_match_data_create.html
+src/libpcre2/doc/html/pcre2_match_data_create_from_pattern.html
+src/libpcre2/doc/html/pcre2_match_data_free.html
+src/libpcre2/doc/html/pcre2_pattern_convert.html
+src/libpcre2/doc/html/pcre2_pattern_info.html
+src/libpcre2/doc/html/pcre2_serialize_decode.html
+src/libpcre2/doc/html/pcre2_serialize_encode.html
+src/libpcre2/doc/html/pcre2_serialize_free.html
+src/libpcre2/doc/html/pcre2_serialize_get_number_of_codes.html
+src/libpcre2/doc/html/pcre2_set_bsr.html
+src/libpcre2/doc/html/pcre2_set_callout.html
+src/libpcre2/doc/html/pcre2_set_character_tables.html
+src/libpcre2/doc/html/pcre2_set_compile_extra_options.html
+src/libpcre2/doc/html/pcre2_set_compile_recursion_guard.html
+src/libpcre2/doc/html/pcre2_set_depth_limit.html
+src/libpcre2/doc/html/pcre2_set_glob_escape.html
+src/libpcre2/doc/html/pcre2_set_glob_separator.html
+src/libpcre2/doc/html/pcre2_set_heap_limit.html
+src/libpcre2/doc/html/pcre2_set_match_limit.html
+src/libpcre2/doc/html/pcre2_set_max_pattern_length.html
+src/libpcre2/doc/html/pcre2_set_max_varlookbehind.html
+src/libpcre2/doc/html/pcre2_set_newline.html
+src/libpcre2/doc/html/pcre2_set_offset_limit.html
+src/libpcre2/doc/html/pcre2_set_parens_nest_limit.html
+src/libpcre2/doc/html/pcre2_set_recursion_limit.html
+src/libpcre2/doc/html/pcre2_set_recursion_memory_management.html
+src/libpcre2/doc/html/pcre2_set_substitute_callout.html
+src/libpcre2/doc/html/pcre2_substitute.html
+src/libpcre2/doc/html/pcre2_substring_copy_byname.html
+src/libpcre2/doc/html/pcre2_substring_copy_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_free.html
+src/libpcre2/doc/html/pcre2_substring_get_byname.html
+src/libpcre2/doc/html/pcre2_substring_get_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_length_byname.html
+src/libpcre2/doc/html/pcre2_substring_length_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_list_free.html
+src/libpcre2/doc/html/pcre2_substring_list_get.html
+src/libpcre2/doc/html/pcre2_substring_nametable_scan.html
+src/libpcre2/doc/html/pcre2_substring_number_from_name.html
+src/libpcre2/doc/html/pcre2api.html
+src/libpcre2/doc/html/pcre2build.html
+src/libpcre2/doc/html/pcre2callout.html
+src/libpcre2/doc/html/pcre2compat.html
+src/libpcre2/doc/html/pcre2convert.html
+src/libpcre2/doc/html/pcre2demo.html
+src/libpcre2/doc/html/pcre2grep.html
+src/libpcre2/doc/html/pcre2jit.html
+src/libpcre2/doc/html/pcre2limits.html
+src/libpcre2/doc/html/pcre2matching.html
+src/libpcre2/doc/html/pcre2partial.html
+src/libpcre2/doc/html/pcre2pattern.html
+src/libpcre2/doc/html/pcre2perform.html
+src/libpcre2/doc/html/pcre2posix.html
+src/libpcre2/doc/html/pcre2sample.html
+src/libpcre2/doc/html/pcre2serialize.html
+src/libpcre2/doc/html/pcre2syntax.html
+src/libpcre2/doc/html/pcre2test.html
+src/libpcre2/doc/html/pcre2unicode.html
+src/libpcre2/m4/ax_pthread.m4
+src/libpcre2/m4/pcre2_visibility.m4
+src/libpcre2/maint/GenerateCommon.py
+src/libpcre2/maint/GenerateTest26.py
+src/libpcre2/maint/GenerateUcd.py
+src/libpcre2/maint/GenerateUcpHeader.py
+src/libpcre2/maint/GenerateUcpTables.py
+src/libpcre2/maint/ManyConfigTests
+src/libpcre2/maint/README
+src/libpcre2/maint/pcre2_chartables.c.non-standard
+src/libpcre2/maint/ucptest.c
+src/libpcre2/maint/utf8.c
+src/libpcre2/maint/Unicode.tables/BidiMirroring.txt
+src/libpcre2/maint/Unicode.tables/CaseFolding.txt
+src/libpcre2/maint/Unicode.tables/DerivedBidiClass.txt
+src/libpcre2/maint/Unicode.tables/DerivedCoreProperties.txt
+src/libpcre2/maint/Unicode.tables/DerivedGeneralCategory.txt
+src/libpcre2/maint/Unicode.tables/GraphemeBreakProperty.txt
+src/libpcre2/maint/Unicode.tables/PropList.txt
+src/libpcre2/maint/Unicode.tables/PropertyAliases.txt
+src/libpcre2/maint/Unicode.tables/PropertyValueAliases.txt
+src/libpcre2/maint/Unicode.tables/ScriptExtensions.txt
+src/libpcre2/maint/Unicode.tables/Scripts.txt
+src/libpcre2/maint/Unicode.tables/UnicodeData.txt
+src/libpcre2/maint/Unicode.tables/emoji-data.txt
+src/libpcre2/maint/ucptestdata/testinput1
+src/libpcre2/maint/ucptestdata/testinput2
+src/libpcre2/maint/ucptestdata/testoutput1
+src/libpcre2/maint/ucptestdata/testoutput2
+src/libpcre2/src/config.h.generic
+src/libpcre2/src/config.h.in
+src/libpcre2/src/pcre2.h.generic
+src/libpcre2/src/pcre2.h.in
+src/libpcre2/src/pcre2_auto_possess.c
+src/libpcre2/src/pcre2_chartables.c.dist
+src/libpcre2/src/pcre2_chkdint.c
+src/libpcre2/src/pcre2_compile.c
+src/libpcre2/src/pcre2_config.c
+src/libpcre2/src/pcre2_context.c
+src/libpcre2/src/pcre2_convert.c
+src/libpcre2/src/pcre2_dfa_match.c
+src/libpcre2/src/pcre2_dftables.c
+src/libpcre2/src/pcre2_error.c
+src/libpcre2/src/pcre2_extuni.c
+src/libpcre2/src/pcre2_find_bracket.c
+src/libpcre2/src/pcre2_fuzzsupport.c
+src/libpcre2/src/pcre2_internal.h
+src/libpcre2/src/pcre2_intmodedep.h
+src/libpcre2/src/pcre2_jit_compile.c
+src/libpcre2/src/pcre2_jit_match.c
+src/libpcre2/src/pcre2_jit_misc.c
+src/libpcre2/src/pcre2_jit_neon_inc.h
+src/libpcre2/src/pcre2_jit_simd_inc.h
+src/libpcre2/src/pcre2_jit_test.c
+src/libpcre2/src/pcre2_maketables.c
+src/libpcre2/src/pcre2_match.c
+src/libpcre2/src/pcre2_match_data.c
+src/libpcre2/src/pcre2_newline.c
+src/libpcre2/src/pcre2_ord2utf.c
+src/libpcre2/src/pcre2_pattern_info.c
+src/libpcre2/src/pcre2_printint.c
+src/libpcre2/src/pcre2_script_run.c
+src/libpcre2/src/pcre2_serialize.c
+src/libpcre2/src/pcre2_string_utils.c
+src/libpcre2/src/pcre2_study.c
+src/libpcre2/src/pcre2_substitute.c
+src/libpcre2/src/pcre2_substring.c
+src/libpcre2/src/pcre2_tables.c
+src/libpcre2/src/pcre2_ucd.c
+src/libpcre2/src/pcre2_ucp.h
+src/libpcre2/src/pcre2_ucptables.c
+src/libpcre2/src/pcre2_valid_utf.c
+src/libpcre2/src/pcre2_xclass.c
+src/libpcre2/src/pcre2demo.c
+src/libpcre2/src/pcre2grep.c
+src/libpcre2/src/pcre2posix.c
+src/libpcre2/src/pcre2posix.h
+src/libpcre2/src/pcre2posix_test.c
+src/libpcre2/src/pcre2test.c
+src/libpcre2/src/sljit/sljitConfig.h
+src/libpcre2/src/sljit/sljitConfigCPU.h
+src/libpcre2/src/sljit/sljitConfigInternal.h
+src/libpcre2/src/sljit/sljitLir.c
+src/libpcre2/src/sljit/sljitLir.h
+src/libpcre2/src/sljit/sljitNativeARM_32.c
+src/libpcre2/src/sljit/sljitNativeARM_64.c
+src/libpcre2/src/sljit/sljitNativeARM_T2_32.c
+src/libpcre2/src/sljit/sljitNativeLOONGARCH_64.c
+src/libpcre2/src/sljit/sljitNativeMIPS_32.c
+src/libpcre2/src/sljit/sljitNativeMIPS_64.c
+src/libpcre2/src/sljit/sljitNativeMIPS_common.c
+src/libpcre2/src/sljit/sljitNativePPC_32.c
+src/libpcre2/src/sljit/sljitNativePPC_64.c
+src/libpcre2/src/sljit/sljitNativePPC_common.c
+src/libpcre2/src/sljit/sljitNativeRISCV_32.c
+src/libpcre2/src/sljit/sljitNativeRISCV_64.c
+src/libpcre2/src/sljit/sljitNativeRISCV_common.c
+src/libpcre2/src/sljit/sljitNativeS390X.c
+src/libpcre2/src/sljit/sljitNativeX86_32.c
+src/libpcre2/src/sljit/sljitNativeX86_64.c
+src/libpcre2/src/sljit/sljitNativeX86_common.c
+src/libpcre2/src/sljit/sljitUtils.c
+src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorApple.c
+src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorCore.c
+src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorFreeBSD.c
+src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorPosix.c
+src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorWindows.c
+src/libpcre2/src/sljit/allocator_src/sljitProtExecAllocatorNetBSD.c
+src/libpcre2/src/sljit/allocator_src/sljitProtExecAllocatorPosix.c
+src/libpcre2/src/sljit/allocator_src/sljitWXExecAllocatorPosix.c
+src/libpcre2/src/sljit/allocator_src/sljitWXExecAllocatorWindows.c
+src/libpcre2/testdata/grepbinary
+src/libpcre2/testdata/grepfilelist
+src/libpcre2/testdata/grepinput
+src/libpcre2/testdata/grepinput3
+src/libpcre2/testdata/grepinput8
+src/libpcre2/testdata/grepinputC.bz2
+src/libpcre2/testdata/grepinputC.gz
+src/libpcre2/testdata/grepinputM
+src/libpcre2/testdata/grepinputv
+src/libpcre2/testdata/grepinputx
+src/libpcre2/testdata/greplist
+src/libpcre2/testdata/grepnot.bz2
+src/libpcre2/testdata/grepoutput
+src/libpcre2/testdata/grepoutput8
+src/libpcre2/testdata/grepoutputC
+src/libpcre2/testdata/grepoutputCN
+src/libpcre2/testdata/grepoutputCNU
+src/libpcre2/testdata/grepoutputCU
+src/libpcre2/testdata/grepoutputCbz2
+src/libpcre2/testdata/grepoutputCgz
+src/libpcre2/testdata/grepoutputN
+src/libpcre2/testdata/grepoutputUN
+src/libpcre2/testdata/greppatN4
+src/libpcre2/testdata/testbtables
+src/libpcre2/testdata/testinput1
+src/libpcre2/testdata/testinput10
+src/libpcre2/testdata/testinput11
+src/libpcre2/testdata/testinput12
+src/libpcre2/testdata/testinput13
+src/libpcre2/testdata/testinput14
+src/libpcre2/testdata/testinput15
+src/libpcre2/testdata/testinput16
+src/libpcre2/testdata/testinput17
+src/libpcre2/testdata/testinput18
+src/libpcre2/testdata/testinput19
+src/libpcre2/testdata/testinput2
+src/libpcre2/testdata/testinput20
+src/libpcre2/testdata/testinput21
+src/libpcre2/testdata/testinput22
+src/libpcre2/testdata/testinput23
+src/libpcre2/testdata/testinput24
+src/libpcre2/testdata/testinput25
+src/libpcre2/testdata/testinput26
+src/libpcre2/testdata/testinput3
+src/libpcre2/testdata/testinput4
+src/libpcre2/testdata/testinput5
+src/libpcre2/testdata/testinput6
+src/libpcre2/testdata/testinput7
+src/libpcre2/testdata/testinput8
+src/libpcre2/testdata/testinput9
+src/libpcre2/testdata/testinputEBC
+src/libpcre2/testdata/testinputheap
+src/libpcre2/testdata/testoutput1
+src/libpcre2/testdata/testoutput10
+src/libpcre2/testdata/testoutput11-16
+src/libpcre2/testdata/testoutput11-32
+src/libpcre2/testdata/testoutput12-16
+src/libpcre2/testdata/testoutput12-32
+src/libpcre2/testdata/testoutput13
+src/libpcre2/testdata/testoutput14-16
+src/libpcre2/testdata/testoutput14-32
+src/libpcre2/testdata/testoutput14-8
+src/libpcre2/testdata/testoutput15
+src/libpcre2/testdata/testoutput16
+src/libpcre2/testdata/testoutput17
+src/libpcre2/testdata/testoutput18
+src/libpcre2/testdata/testoutput19
+src/libpcre2/testdata/testoutput2
+src/libpcre2/testdata/testoutput20
+src/libpcre2/testdata/testoutput21
+src/libpcre2/testdata/testoutput22-16
+src/libpcre2/testdata/testoutput22-32
+src/libpcre2/testdata/testoutput22-8
+src/libpcre2/testdata/testoutput23
+src/libpcre2/testdata/testoutput24
+src/libpcre2/testdata/testoutput25
+src/libpcre2/testdata/testoutput26
+src/libpcre2/testdata/testoutput3
+src/libpcre2/testdata/testoutput3A
+src/libpcre2/testdata/testoutput3B
+src/libpcre2/testdata/testoutput4
+src/libpcre2/testdata/testoutput5
+src/libpcre2/testdata/testoutput6
+src/libpcre2/testdata/testoutput7
+src/libpcre2/testdata/testoutput8-16-2
+src/libpcre2/testdata/testoutput8-16-3
+src/libpcre2/testdata/testoutput8-16-4
+src/libpcre2/testdata/testoutput8-32-2
+src/libpcre2/testdata/testoutput8-32-3
+src/libpcre2/testdata/testoutput8-32-4
+src/libpcre2/testdata/testoutput8-8-2
+src/libpcre2/testdata/testoutput8-8-3
+src/libpcre2/testdata/testoutput8-8-4
+src/libpcre2/testdata/testoutput9
+src/libpcre2/testdata/testoutputEBC
+src/libpcre2/testdata/testoutputheap-16
+src/libpcre2/testdata/testoutputheap-32
+src/libpcre2/testdata/testoutputheap-8
+src/libpcre2/testdata/valgrind-jit.supp
+src/libpcre2/testdata/wintestinput3
+src/libpcre2/testdata/wintestoutput3
+src/pcre2/CMakeLists.txt
+src/pcre2/__init__.py
+src/pcre2/consts.pxd
+src/pcre2/consts.pyx
+src/pcre2/exceptions.pxd
+src/pcre2/exceptions.pyx
+src/pcre2/libpcre2.pxd
+src/pcre2/match.pxd
+src/pcre2/match.pyx
+src/pcre2/methods.pxd
+src/pcre2/methods.pyx
+src/pcre2/pattern.pxd
+src/pcre2/pattern.pyx
+src/pcre2/scanner.pxd
+src/pcre2/scanner.pyx
+src/pcre2/utils.pxd
+src/pcre2/utils.pyx
+src/pcre2.egg-info/PKG-INFO
+src/pcre2.egg-info/SOURCES.txt
+src/pcre2.egg-info/dependency_links.txt
+src/pcre2.egg-info/top_level.txt
+tests/test_groups.py
+tests/test_match.py
+tests/test_pattern.py
\ No newline at end of file
--- /dev/null
+find_package(Cython MODULE REQUIRED)
+find_package(PythonExtensions MODULE REQUIRED)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+# Build Cython with annotations.
+set(CYTHON_ANNOTATE TRUE)
+
+# Macro to add Cython files as modules, configured to build with PCRE2.
+macro(add_pyx_file filename)
+ add_cython_target(${filename} C PY3)
+ add_library(${filename} MODULE ${filename})
+ python_extension_module(${filename})
+
+ target_link_libraries(${filename} pcre2-8-static)
+ target_include_directories(${filename} PRIVATE ${PCRE2_INCLUDE_DIR})
+ target_compile_options(${filename} PRIVATE ${CYTHON_EXTRA_COMPILE_ARGS})
+
+ install(TARGETS ${filename} LIBRARY DESTINATION src/pcre2)
+endmacro()
+
+# GLOB pattern is recommended against,
+# https://cmake.org/cmake/help/v3.14/command/file.html?highlight=file#filesystem
+add_pyx_file(consts)
+add_pyx_file(exceptions)
+add_pyx_file(match)
+add_pyx_file(methods)
+add_pyx_file(pattern)
+add_pyx_file(scanner)
+add_pyx_file(utils)
+
+
+# Include .pyx and .pxd files in distribution for use by Cython API.
+install(
+ FILES
+ consts.pxd
+ consts.pyx
+ exceptions.pxd
+ exceptions.pyx
+ libpcre2.pxd
+ match.pxd
+ match.pyx
+ methods.pxd
+ methods.pyx
+ pattern.pxd
+ pattern.pyx
+ scanner.pxd
+ scanner.pyx
+ utils.pxd
+ utils.pyx
+ DESTINATION
+ src/pcre2
+)
\ No newline at end of file
--- /dev/null
+from .methods import compile, findall, match, scan, split, substitute
+from .consts import (
+ __libpcre2_version__,
+ CompileOption,
+ A, I, M, U, S, X
+)
+__version__ = "0.4.0"
--- /dev/null
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from enum import IntEnum
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+
+
+__libpcre2_version__ = f"{PCRE2_MAJOR}.{PCRE2_MINOR}"
+
+
+class MetaOption(IntEnum):
+ def __repr__(self):
+ return f"<{self.__class__.__name__}.{self._name_}: 0x{self._value_:x}>"
+
+ @classmethod
+ def verify(cls, options):
+ """ Verify a number is composed of options.
+ """
+ tmp = options
+ for opt in cls:
+ tmp ^= (opt & tmp)
+ return tmp == 0
+
+
+ @classmethod
+ def decompose(cls, options):
+ """ Decompose a number into its component options, returning a list of
+ MetaOption enums that are components of the given options. Note that
+ left over bits are ignored, and veracity can not be determined from
+ the result.
+ """
+ return [opt for opt in cls if (opt & options)]
+
+
+class CompileOption(MetaOption):
+ """ Option bits to be used in pattern compilation. See the following PCRE2
+ documentation for a brief overview of the relevant options:
+ http://pcre.org/current/doc/html/pcre2_compile.html
+ """
+
+ ALLOW_EMPTY_CLASS = PCRE2_ALLOW_EMPTY_CLASS
+ ALT_BSUX = PCRE2_ALT_BSUX
+ ALT_CIRCUMFLEX = PCRE2_ALT_CIRCUMFLEX
+ ALT_VERBNAMES = PCRE2_ALT_VERBNAMES
+ ANCHORED = PCRE2_ANCHORED
+ CASELESS = PCRE2_CASELESS
+ DOLLAR_ENDONLY = PCRE2_DOLLAR_ENDONLY
+ DOTALL = PCRE2_DOTALL
+ DUPNAMES = PCRE2_DUPNAMES
+ ENDANCHORED = PCRE2_ENDANCHORED
+ EXTENDED = PCRE2_EXTENDED
+ EXTENDED_MORE = PCRE2_EXTENDED_MORE
+ FIRSTLINE = PCRE2_FIRSTLINE
+ LITERAL = PCRE2_LITERAL
+ MATCH_UNSET_BACKREF = PCRE2_MATCH_UNSET_BACKREF
+ MULTILINE = PCRE2_MULTILINE
+ UCP = PCRE2_UCP
+ UNGREEDY = PCRE2_UNGREEDY
+ UTF = PCRE2_UTF
+
+
+class BsrChar(IntEnum):
+ """ Indicator for what character(s) are denoted by `\r`.
+ """
+ UNICODE = PCRE2_BSR_UNICODE
+ ANYCRLF = PCRE2_BSR_ANYCRLF
+
+
+class NewlineChar(IntEnum):
+ """ Indicator for what character(s) denote a newline.
+ """
+ CR = PCRE2_NEWLINE_CR
+ LF = PCRE2_NEWLINE_LF
+ CRLF = PCRE2_NEWLINE_CRLF
+ ANY = PCRE2_NEWLINE_ANY
+ ANYCRLF = PCRE2_NEWLINE_ANYCRLF
+ NUL = PCRE2_NEWLINE_NUL
+
+
+# Shorthands
+A = CompileOption.ANCHORED
+I = CompileOption.CASELESS
+M = CompileOption.MULTILINE
+U = CompileOption.UTF
+S = CompileOption.DOTALL
+X = CompileOption.EXTENDED
--- /dev/null
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdint cimport uint8_t
+
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+
+
+class LibraryError(Exception):
+ """ Catch all for other PCRE2 errors (e.g. bad option bits).
+ """
+
+ def __init__(self, errorcode, context_msg=""):
+ cdef uint8_t errormsg_buf[120]
+ get_error_message_rc = pcre2_get_error_message(
+ errorcode,
+ errormsg_buf, sizeof(errormsg_buf)
+ )
+
+ # Handle errors in fetching error message.
+ if get_error_message_rc == PCRE2_ERROR_NOMEMORY:
+ raise MemoryError()
+ elif get_error_message_rc < 0:
+ raise LibraryError(
+ get_error_message_rc,
+ context_msg=f"Could not retrieve message for error code {get_error_message_rc}."
+ )
+
+ msg = errormsg_buf.decode("utf-8").capitalize()
+ if context_msg:
+ msg = context_msg + ". " + msg
+
+ super().__init__(msg)
+ self.errorcode = errorcode
+
+
+class CompileError(LibraryError):
+ """ Raised when pattern is malformed or is otherwise unable to be
+ compiled.
+ """
+
+ def __init__(self, errorcode, context_msg=""):
+ if not (errorcode > 0):
+ raise ValueError("Compilation error codes are strictly positive")
+
+ super().__init__(errorcode, context_msg=context_msg)
+
+
+class MatchError(LibraryError):
+ """ Raised when no or partial match found.
+ """
+
+ def __init__(self, errorcode, context_msg=""):
+ if not (errorcode == PCRE2_ERROR_NOMATCH or errorcode == PCRE2_ERROR_PARTIAL):
+ raise ValueError(
+ f"Invalid error code '{errorcode}'. "
+ "Match error codes can only be of value PCRE2_ERROR_NOMATCH or PCRE2_ERROR_PARTIAL"
+ )
+
+ super().__init__(errorcode, context_msg=context_msg)
--- /dev/null
+# -*- coding:utf-8 -*-
+
+from libc.stdint cimport uint8_t, uint32_t, int32_t
+
+
+cdef extern from "pcre2.h":
+ cdef unsigned int PCRE2_MAJOR
+ cdef unsigned int PCRE2_MINOR
+
+ # The following option bits can be passed to pcre2_compile(),
+ # pcre2_match(), or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the
+ # function to which it is passed. Put these bits at the most significant
+ # end of the options word so others can be added next to them.
+ cdef unsigned int PCRE2_ANCHORED
+ cdef unsigned int PCRE2_NO_UTF_CHECK
+ cdef unsigned int PCRE2_ENDANCHORED
+
+ # The following option bits can be passed only to pcre2_compile(). However,
+ # they may affect compilation, JIT compilation, and/or interpretive
+ # execution. The following tags indicate which:
+ # C alters what is compiled by pcre2_compile()
+ # J alters what is compiled by pcre2_jit_compile()
+ # M is inspected during pcre2_match() execution
+ # D is inspected during pcre2_dfa_match() execution
+ cdef unsigned int PCRE2_ALLOW_EMPTY_CLASS # C
+ cdef unsigned int PCRE2_ALT_BSUX # C
+ cdef unsigned int PCRE2_AUTO_CALLOUT # C
+ cdef unsigned int PCRE2_CASELESS # C
+ cdef unsigned int PCRE2_DOLLAR_ENDONLY # J M D
+ cdef unsigned int PCRE2_DOTALL # C
+ cdef unsigned int PCRE2_DUPNAMES # C
+ cdef unsigned int PCRE2_EXTENDED # C
+ cdef unsigned int PCRE2_FIRSTLINE # J M D
+ cdef unsigned int PCRE2_MATCH_UNSET_BACKREF # C J M
+ cdef unsigned int PCRE2_MULTILINE # C
+ cdef unsigned int PCRE2_NEVER_UCP # C
+ cdef unsigned int PCRE2_NEVER_UTF # C
+ cdef unsigned int PCRE2_NO_AUTO_CAPTURE # C
+ cdef unsigned int PCRE2_NO_AUTO_POSSESS # C
+ cdef unsigned int PCRE2_NO_DOTSTAR_ANCHOR # C
+ cdef unsigned int PCRE2_NO_START_OPTIMIZE # J M D
+ cdef unsigned int PCRE2_UCP # C J M D
+ cdef unsigned int PCRE2_UNGREEDY # C
+ cdef unsigned int PCRE2_UTF # C J M D
+ cdef unsigned int PCRE2_NEVER_BACKSLASH_C # C
+ cdef unsigned int PCRE2_ALT_CIRCUMFLEX # J M D
+ cdef unsigned int PCRE2_ALT_VERBNAMES # C
+ cdef unsigned int PCRE2_USE_OFFSET_LIMIT # J M D
+ cdef unsigned int PCRE2_EXTENDED_MORE # C
+ cdef unsigned int PCRE2_LITERAL # C
+ cdef unsigned int PCRE2_MATCH_INVALID_UTF # J M D
+
+ # An additional compile options word is available in the compile context.
+ cdef unsigned int PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES # C
+ cdef unsigned int PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL # C
+ cdef unsigned int PCRE2_EXTRA_MATCH_WORD # C
+ cdef unsigned int PCRE2_EXTRA_MATCH_LINE # C
+ cdef unsigned int PCRE2_EXTRA_ESCAPED_CR_IS_LF # C
+ cdef unsigned int PCRE2_EXTRA_ALT_BSUX # C
+ cdef unsigned int PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK # C
+
+ # These are for pcre2_jit_compile().
+ cdef unsigned int PCRE2_JIT_COMPLETE # For full matching.
+ cdef unsigned int PCRE2_JIT_PARTIAL_SOFT
+ cdef unsigned int PCRE2_JIT_PARTIAL_HARD
+ cdef unsigned int PCRE2_JIT_INVALID_UTF
+
+ # These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and
+ # pcre2_substitute(). Some are allowed only for one of the functions, and
+ # in these cases it is noted below. Note that PCRE2_ANCHORED,
+ # PCRE2_ENDANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these
+ # functions (though pcre2_jit_match() ignores the latter since it bypasses
+ # all sanity checks).
+ cdef unsigned int PCRE2_NOTBOL
+ cdef unsigned int PCRE2_NOTEOL
+ cdef unsigned int PCRE2_NOTEMPTY # ) These two must be kept
+ cdef unsigned int PCRE2_NOTEMPTY_ATSTART # ) adjacent to each other.
+ cdef unsigned int PCRE2_PARTIAL_SOFT
+ cdef unsigned int PCRE2_PARTIAL_HARD
+ cdef unsigned int PCRE2_DFA_RESTART # pcre2_dfa_match() only
+ cdef unsigned int PCRE2_DFA_SHORTEST # pcre2_dfa_match() only
+ cdef unsigned int PCRE2_SUBSTITUTE_GLOBAL # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_EXTENDED # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_UNSET_EMPTY # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_UNKNOWN_UNSET # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_OVERFLOW_LENGTH # pcre2_substitute() only
+ cdef unsigned int PCRE2_NO_JIT # Not for pcre2_dfa_match()
+ cdef unsigned int PCRE2_COPY_MATCHED_SUBJECT
+ cdef unsigned int PCRE2_SUBSTITUTE_LITERAL # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_MATCHED # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_REPLACEMENT_ONLY # pcre2_substitute() only
+
+ # Options for pcre2_pattern_convert().
+ cdef unsigned int PCRE2_CONVERT_UTF
+ cdef unsigned int PCRE2_CONVERT_NO_UTF_CHECK
+ cdef unsigned int PCRE2_CONVERT_POSIX_BASIC
+ cdef unsigned int PCRE2_CONVERT_POSIX_EXTENDED
+ cdef unsigned int PCRE2_CONVERT_GLOB
+ cdef unsigned int PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR
+ cdef unsigned int PCRE2_CONVERT_GLOB_NO_STARSTAR
+
+ # Newline and \R settings, for use in compile contexts. The newline values
+ # must be kept in step with values set in config.h and both sets must all
+ # be greater than zero.
+ cdef int PCRE2_NEWLINE_CR
+ cdef int PCRE2_NEWLINE_LF
+ cdef int PCRE2_NEWLINE_CRLF
+ cdef int PCRE2_NEWLINE_ANY
+ cdef int PCRE2_NEWLINE_ANYCRLF
+ cdef int PCRE2_NEWLINE_NUL
+
+ cdef int PCRE2_BSR_UNICODE
+ cdef int PCRE2_BSR_ANYCRLF
+
+ # Error codes for pcre2_compile(). Some of these are also used by
+ # pcre2_pattern_convert().
+ cdef int PCRE2_ERROR_END_BACKSLASH
+ cdef int PCRE2_ERROR_END_BACKSLASH_C
+ cdef int PCRE2_ERROR_UNKNOWN_ESCAPE
+ cdef int PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER
+ cdef int PCRE2_ERROR_QUANTIFIER_TOO_BIG
+ cdef int PCRE2_ERROR_MISSING_SQUARE_BRACKET
+ cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS
+ cdef int PCRE2_ERROR_CLASS_RANGE_ORDER
+ cdef int PCRE2_ERROR_QUANTIFIER_INVALID
+ cdef int PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT
+ cdef int PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY
+ cdef int PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS
+ cdef int PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING
+ cdef int PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS
+ cdef int PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE
+ cdef int PCRE2_ERROR_NULL_PATTERN
+ cdef int PCRE2_ERROR_BAD_OPTIONS
+ cdef int PCRE2_ERROR_MISSING_COMMENT_CLOSING
+ cdef int PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP
+ cdef int PCRE2_ERROR_PATTERN_TOO_LARGE
+ cdef int PCRE2_ERROR_HEAP_FAILED
+ cdef int PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS
+ cdef int PCRE2_ERROR_INTERNAL_CODE_OVERFLOW
+ cdef int PCRE2_ERROR_MISSING_CONDITION_CLOSING
+ cdef int PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH
+ cdef int PCRE2_ERROR_ZERO_RELATIVE_REFERENCE
+ cdef int PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES
+ cdef int PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED
+ cdef int PCRE2_ERROR_BAD_RELATIVE_REFERENCE
+ cdef int PCRE2_ERROR_UNKNOWN_POSIX_CLASS
+ cdef int PCRE2_ERROR_INTERNAL_STUDY_ERROR
+ cdef int PCRE2_ERROR_UNICODE_NOT_SUPPORTED
+ cdef int PCRE2_ERROR_PARENTHESES_STACK_CHECK
+ cdef int PCRE2_ERROR_CODE_POINT_TOO_BIG
+ cdef int PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED
+ cdef int PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C
+ cdef int PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE
+ cdef int PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG
+ cdef int PCRE2_ERROR_MISSING_CALLOUT_CLOSING
+ cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_VERB
+ cdef int PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P
+ cdef int PCRE2_ERROR_MISSING_NAME_TERMINATOR
+ cdef int PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME
+ cdef int PCRE2_ERROR_INVALID_SUBPATTERN_NAME
+ cdef int PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE
+ cdef int PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY
+ cdef int PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY
+ cdef int PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG
+ cdef int PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS
+ cdef int PCRE2_ERROR_CLASS_INVALID_RANGE
+ cdef int PCRE2_ERROR_OCTAL_BYTE_TOO_BIG
+ cdef int PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE
+ cdef int PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN
+ cdef int PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES
+ cdef int PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE
+ cdef int PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE
+ cdef int PCRE2_ERROR_BACKSLASH_G_SYNTAX
+ cdef int PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING
+ # Error 159 is obsolete and should now never occur
+ cdef int PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED
+ cdef int PCRE2_ERROR_VERB_UNKNOWN
+ cdef int PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG
+ cdef int PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED
+ cdef int PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW
+ cdef int PCRE2_ERROR_INVALID_OCTAL
+ cdef int PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH
+ cdef int PCRE2_ERROR_MARK_MISSING_ARGUMENT
+ cdef int PCRE2_ERROR_INVALID_HEXADECIMAL
+ cdef int PCRE2_ERROR_BACKSLASH_C_SYNTAX
+ cdef int PCRE2_ERROR_BACKSLASH_K_SYNTAX
+ cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS
+ cdef int PCRE2_ERROR_BACKSLASH_N_IN_CLASS
+ cdef int PCRE2_ERROR_CALLOUT_STRING_TOO_LONG
+ cdef int PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT
+ cdef int PCRE2_ERROR_UTF_IS_DISABLED
+ cdef int PCRE2_ERROR_UCP_IS_DISABLED
+ cdef int PCRE2_ERROR_VERB_NAME_TOO_LONG
+ cdef int PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG
+ cdef int PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS
+ cdef int PCRE2_ERROR_VERSION_CONDITION_SYNTAX
+ cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS
+ cdef int PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER
+ cdef int PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER
+ cdef int PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED
+ cdef int PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP
+ cdef int PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED
+ cdef int PCRE2_ERROR_PATTERN_TOO_COMPLICATED
+ cdef int PCRE2_ERROR_LOOKBEHIND_TOO_LONG
+ cdef int PCRE2_ERROR_PATTERN_STRING_TOO_LONG
+ cdef int PCRE2_ERROR_INTERNAL_BAD_CODE
+ cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP
+ cdef int PCRE2_ERROR_NO_SURROGATES_IN_UTF16
+ cdef int PCRE2_ERROR_BAD_LITERAL_OPTIONS
+ cdef int PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE
+ cdef int PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS
+ cdef int PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN
+ cdef int PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE
+ cdef int PCRE2_ERROR_TOO_MANY_CAPTURES
+ cdef int PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED
+ cdef int PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND
+
+ # "Expected" matching error codes: no match and partial match.
+ cdef int PCRE2_ERROR_NOMATCH
+ cdef int PCRE2_ERROR_PARTIAL
+
+ # Error codes for UTF-8 validity checks.
+ cdef int PCRE2_ERROR_UTF8_ERR1
+ cdef int PCRE2_ERROR_UTF8_ERR2
+ cdef int PCRE2_ERROR_UTF8_ERR3
+ cdef int PCRE2_ERROR_UTF8_ERR4
+ cdef int PCRE2_ERROR_UTF8_ERR5
+ cdef int PCRE2_ERROR_UTF8_ERR6
+ cdef int PCRE2_ERROR_UTF8_ERR7
+ cdef int PCRE2_ERROR_UTF8_ERR8
+ cdef int PCRE2_ERROR_UTF8_ERR9
+ cdef int PCRE2_ERROR_UTF8_ERR10
+ cdef int PCRE2_ERROR_UTF8_ERR11
+ cdef int PCRE2_ERROR_UTF8_ERR12
+ cdef int PCRE2_ERROR_UTF8_ERR13
+ cdef int PCRE2_ERROR_UTF8_ERR14
+ cdef int PCRE2_ERROR_UTF8_ERR15
+ cdef int PCRE2_ERROR_UTF8_ERR16
+ cdef int PCRE2_ERROR_UTF8_ERR17
+ cdef int PCRE2_ERROR_UTF8_ERR18
+ cdef int PCRE2_ERROR_UTF8_ERR19
+ cdef int PCRE2_ERROR_UTF8_ERR20
+ cdef int PCRE2_ERROR_UTF8_ERR21
+
+ # Error codes for UTF-16 validity checks.
+ cdef int PCRE2_ERROR_UTF16_ERR1
+ cdef int PCRE2_ERROR_UTF16_ERR2
+ cdef int PCRE2_ERROR_UTF16_ERR3
+
+ # Error codes for UTF-32 validity checks.
+ cdef int PCRE2_ERROR_UTF32_ERR1
+ cdef int PCRE2_ERROR_UTF32_ERR2
+
+ # Miscellaneous error codes for pcre2[_dfa]_match(), substring extraction
+ # functions, context functions, and serializing functions. They are in
+ # numerical order. Originally they were in alphabetical order too, but now
+ # that PCRE2 is released, the numbers must not be changed.
+ cdef int PCRE2_ERROR_BADDATA
+ cdef int PCRE2_ERROR_MIXEDTABLES # Name was changed.
+ cdef int PCRE2_ERROR_BADMAGIC
+ cdef int PCRE2_ERROR_BADMODE
+ cdef int PCRE2_ERROR_BADOFFSET
+ cdef int PCRE2_ERROR_BADOPTION
+ cdef int PCRE2_ERROR_BADREPLACEMENT
+ cdef int PCRE2_ERROR_BADUTFOFFSET
+ cdef int PCRE2_ERROR_CALLOUT # Never used by PCRE2 itself.
+ cdef int PCRE2_ERROR_DFA_BADRESTART
+ cdef int PCRE2_ERROR_DFA_RECURSE
+ cdef int PCRE2_ERROR_DFA_UCOND
+ cdef int PCRE2_ERROR_DFA_UFUNC
+ cdef int PCRE2_ERROR_DFA_UITEM
+ cdef int PCRE2_ERROR_DFA_WSSIZE
+ cdef int PCRE2_ERROR_INTERNAL
+ cdef int PCRE2_ERROR_JIT_BADOPTION
+ cdef int PCRE2_ERROR_JIT_STACKLIMIT
+ cdef int PCRE2_ERROR_MATCHLIMIT
+ cdef int PCRE2_ERROR_NOMEMORY
+ cdef int PCRE2_ERROR_NOSUBSTRING
+ cdef int PCRE2_ERROR_NOUNIQUESUBSTRING
+ cdef int PCRE2_ERROR_NULL
+ cdef int PCRE2_ERROR_RECURSELOOP
+ cdef int PCRE2_ERROR_DEPTHLIMIT
+ cdef int PCRE2_ERROR_RECURSIONLIMIT # Obsolete synonym.
+ cdef int PCRE2_ERROR_UNAVAILABLE
+ cdef int PCRE2_ERROR_UNSET
+ cdef int PCRE2_ERROR_BADOFFSETLIMIT
+ cdef int PCRE2_ERROR_BADREPESCAPE
+ cdef int PCRE2_ERROR_REPMISSINGBRACE
+ cdef int PCRE2_ERROR_BADSUBSTITUTION
+ cdef int PCRE2_ERROR_BADSUBSPATTERN
+ cdef int PCRE2_ERROR_TOOMANYREPLACE
+ cdef int PCRE2_ERROR_BADSERIALIZEDDATA
+ cdef int PCRE2_ERROR_HEAPLIMIT
+ cdef int PCRE2_ERROR_CONVERT_SYNTAX
+ cdef int PCRE2_ERROR_INTERNAL_DUPMATCH
+ cdef int PCRE2_ERROR_DFA_UINVALID_UTF
+
+ # Request types for pcre2_pattern_info().
+ cdef int PCRE2_INFO_ALLOPTIONS
+ cdef int PCRE2_INFO_ARGOPTIONS
+ cdef int PCRE2_INFO_BACKREFMAX
+ cdef int PCRE2_INFO_BSR
+ cdef int PCRE2_INFO_CAPTURECOUNT
+ cdef int PCRE2_INFO_FIRSTCODEUNIT
+ cdef int PCRE2_INFO_FIRSTCODETYPE
+ cdef int PCRE2_INFO_FIRSTBITMAP
+ cdef int PCRE2_INFO_HASCRORLF
+ cdef int PCRE2_INFO_JCHANGED
+ cdef int PCRE2_INFO_JITSIZE
+ cdef int PCRE2_INFO_LASTCODEUNIT
+ cdef int PCRE2_INFO_LASTCODETYPE
+ cdef int PCRE2_INFO_MATCHEMPTY
+ cdef int PCRE2_INFO_MATCHLIMIT
+ cdef int PCRE2_INFO_MAXLOOKBEHIND
+ cdef int PCRE2_INFO_MINLENGTH
+ cdef int PCRE2_INFO_NAMECOUNT
+ cdef int PCRE2_INFO_NAMEENTRYSIZE
+ cdef int PCRE2_INFO_NAMETABLE
+ cdef int PCRE2_INFO_NEWLINE
+ cdef int PCRE2_INFO_DEPTHLIMIT
+ cdef int PCRE2_INFO_RECURSIONLIMIT # Obsolete synonym
+ cdef int PCRE2_INFO_SIZE
+ cdef int PCRE2_INFO_HASBACKSLASHC
+ cdef int PCRE2_INFO_FRAMESIZE
+ cdef int PCRE2_INFO_HEAPLIMIT
+ cdef int PCRE2_INFO_EXTRAOPTIONS
+
+ # Request types for pcre2_config().
+ cdef int PCRE2_CONFIG_BSR
+ cdef int PCRE2_CONFIG_JIT
+ cdef int PCRE2_CONFIG_JITTARGET
+ cdef int PCRE2_CONFIG_LINKSIZE
+ cdef int PCRE2_CONFIG_MATCHLIMIT
+ cdef int PCRE2_CONFIG_NEWLINE
+ cdef int PCRE2_CONFIG_PARENSLIMIT
+ cdef int PCRE2_CONFIG_DEPTHLIMIT
+ cdef int PCRE2_CONFIG_RECURSIONLIMIT # Obsolete synonym
+ cdef int PCRE2_CONFIG_STACKRECURSE # Obsolete
+ cdef int PCRE2_CONFIG_UNICODE
+ cdef int PCRE2_CONFIG_UNICODE_VERSION
+ cdef int PCRE2_CONFIG_VERSION
+ cdef int PCRE2_CONFIG_HEAPLIMIT
+ cdef int PCRE2_CONFIG_NEVER_BACKSLASH_C
+ cdef int PCRE2_CONFIG_COMPILED_WIDTHS
+ cdef int PCRE2_CONFIG_TABLES_LENGTH
+
+
+ # Opaque handles for PCRE2 defined structs.
+ ctypedef struct pcre2_code_t "pcre2_code":
+ pass
+ ctypedef struct pcre2_match_data_t "pcre2_match_data":
+ pass
+ ctypedef struct pcre2_general_context_t "pcre2_general_context":
+ pass
+ ctypedef struct pcre2_compile_context_t "pcre2_compile_context":
+ pass
+ ctypedef struct pcre2_match_context_t "pcre2_match_context":
+ pass
+
+ # Basic string definition. Note that this assumes PCRE2 in compiled to
+ # support 8-bit strings.
+ ctypedef const uint8_t *pcre2_sptr_t "PCRE2_SPTR"
+
+
+ # Error handling functions.
+ int pcre2_get_error_message(
+ int errorcode,
+ uint8_t *buffer,
+ size_t bufflen
+ )
+
+ # Pattern compilation functions.
+ pcre2_code_t * pcre2_compile(
+ pcre2_sptr_t pattern,
+ size_t length,
+ uint32_t options,
+ int *errorcode,
+ size_t *erroroffset,
+ pcre2_compile_context_t *ccontext
+ )
+
+ int pcre2_jit_compile(
+ pcre2_code_t *code,
+ uint32_t options
+ )
+
+
+ void pcre2_code_free(pcre2_code_t *code)
+
+ # Information on compiled pattern.
+ int pcre2_pattern_info(
+ const pcre2_code_t *code,
+ uint32_t what,
+ void *where
+ )
+
+ int pcre2_substring_number_from_name(
+ const pcre2_code_t *code,
+ pcre2_sptr_t name
+ )
+
+ # Matching and match data functions.
+ pcre2_match_data_t * pcre2_match_data_create(
+ uint32_t ovecsize,
+ pcre2_general_context_t *gcontext
+ )
+
+ pcre2_match_data_t * pcre2_match_data_create_from_pattern(
+ const pcre2_code_t *code,
+ pcre2_general_context_t *gcontext
+ )
+
+ int pcre2_match(
+ const pcre2_code_t *code,
+ pcre2_sptr_t subject,
+ size_t length,
+ size_t startoffset,
+ uint32_t options,
+ pcre2_match_data_t *match_data,
+ pcre2_match_context_t *mcontext
+ )
+ int pcre2_jit_match(
+ const pcre2_code_t *code,
+ pcre2_sptr_t subject,
+ size_t length,
+ size_t startoffset,
+ uint32_t options,
+ pcre2_match_data_t *match_data,
+ pcre2_match_context_t *mcontext
+ )
+
+ void pcre2_match_data_free(pcre2_match_data_t *match_data)
+
+ uint32_t pcre2_get_ovector_count(pcre2_match_data_t *match_data)
+
+ size_t *pcre2_get_ovector_pointer(pcre2_match_data_t *match_data)
+
+ int pcre2_substring_nametable_scan(
+ const pcre2_code_t *code,
+ pcre2_sptr_t name,
+ pcre2_sptr_t *first,
+ pcre2_sptr_t *last
+ )
+
+ # String extraction from match data blocks.
+ int pcre2_substring_length_byname(
+ pcre2_match_data_t *match_data,
+ pcre2_sptr_t name,
+ size_t *bufflen
+ )
+
+ int pcre2_substring_get_byname(
+ pcre2_match_data_t *match_data,
+ pcre2_sptr_t name,
+ uint8_t **bufferptr,
+ size_t *bufflen
+ )
+
+ int pcre2_substring_length_bynumber(
+ pcre2_match_data_t *match_data,
+ uint32_t number,
+ size_t *bufflen
+ )
+
+ int pcre2_substring_get_bynumber(
+ pcre2_match_data_t *match_data,
+ uint32_t number,
+ uint8_t **bufferptr,
+ size_t *bufflen
+ )
+
+ # Substitution.
+ int pcre2_substitute(
+ const pcre2_code_t *code,
+ pcre2_sptr_t subject,
+ size_t length,
+ size_t startoffset,
+ uint32_t options,
+ pcre2_match_data_t *match_data,
+ pcre2_match_context_t *mcontext,
+ pcre2_sptr_t replacement,
+ size_t rlength,
+ uint8_t *outputbuffer,
+ size_t *outlengthptr
+ )
+
+ # Serialization.
+ int32_t pcre2_serialize_decode(
+ pcre2_code_t **codes,
+ int32_t number_of_codes,
+ const uint8_t *code_bytes,
+ pcre2_general_context_t *gcontex
+ )
+ int32_t pcre2_serialize_encode(
+ pcre2_code_t **codes,
+ int32_t number_of_codes,
+ uint8_t **serialized_bytes,
+ size_t *serialized_size,
+ pcre2_general_context_t *gcontex
+ )
+ void pcre2_serialize_free(uint8_t *bytes)
--- /dev/null
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+from libc.stdint cimport uint32_t
+
+# Local imports.
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+
+
+cdef class Match:
+ cdef pcre2_match_data_t *_mtch
+ cdef Pattern _pattern
+ cdef Py_buffer *_subj
+ cdef size_t _ofst # Byte offset, regardless of subject type.
+ cdef uint32_t _opts
+
+ @staticmethod
+ cdef Match _from_data(
+ pcre2_match_data_t *mtch, Pattern pattern, Py_buffer *subj, size_t ofst, uint32_t opts
+ )
--- /dev/null
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from enum import IntEnum
+from libc.stdint cimport uint32_t
+from libc.stdlib cimport malloc, free
+from cpython.unicode cimport PyUnicode_Check
+cimport cython
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+
+
+@cython.freelist(8)
+cdef class Match:
+ """
+ Object wrapper for a match block in PCRE2. Contains all relevant
+ information of a successful match. Attributes defined in match.pxd, see
+ below for an overview:
+ _mtch: Raw match data block, managed by PCRE2
+ _pattern: Pattern object used in match
+ _subj: Subject the pattern was matched against
+ _ofst: Byte offset (egardless of subject type) used in match
+ _opts: Option bits used in match call
+ """
+
+ # =================================== #
+ # Lifetime management #
+ # =================================== #
+
+ def __cinit__(self):
+ self._mtch = NULL
+ self._pattern = None
+ self._subj = NULL
+ self._opts = 0
+
+
+ def __init__(self, *args, **kwargs):
+ # Prevent accidental instantiation from normal Python code since we
+ # cannot pass pointers into a Python constructor.
+ module = self.__class__.__module__
+ qualname = self.__class__.__qualname__
+ raise TypeError(f"Cannot create '{module}.{qualname}' instances")
+
+
+ def __dealloc__(self):
+ if self._subj is not NULL:
+ free_buffer(self._subj)
+ if self._mtch is not NULL:
+ pcre2_match_data_free(self._mtch)
+
+
+ @staticmethod
+ cdef Match _from_data(
+ pcre2_match_data_t *mtch,
+ Pattern pattern,
+ Py_buffer *subj,
+ size_t ofst,
+ uint32_t opts):
+ """ Factory function to create Match objects from C-type fields. The
+ ownership of the given pointers are stolen, which causes the extension
+ type to free them when the object is deallocated.
+ """
+
+ # Fast call to __new__() that bypasses the __init__() constructor.
+ cdef Match match = Match.__new__(Match)
+ match._mtch = mtch
+ match._pattern = pattern
+ match._subj = subj
+ match._ofst = ofst # Code unit offset
+ match._opts = opts
+ return match
+
+
+ # ========================== #
+ # Properties #
+ # ========================== #
+
+ @property
+ def options(self):
+ return self._opts
+
+
+ @property
+ def subject(self):
+ return self._subj.obj
+
+
+ @property
+ def pattern(self):
+ return self._pattern
+
+
+ # ======================= #
+ # Methods #
+ # ======================= #
+
+ def start(self, group=0):
+ """ Get the starting index of the matched substring, or of a specified
+ captured group.
+ """
+ ovec_count = pcre2_get_ovector_count(self._mtch)
+ ovec_table = pcre2_get_ovector_pointer(self._mtch)
+
+ cdef int grp_num
+ cdef pcre2_sptr_t first_entry
+ cdef pcre2_sptr_t last_entry
+ if isinstance(group, int):
+ grp_num = group
+ else:
+ grp_name = get_buffer(group)
+ pcre2_substring_nametable_scan(
+ self._pattern._code, <pcre2_sptr_t>grp_name.buf, &first_entry, &last_entry
+ )
+ grp_num = (first_entry[0] << 8) | first_entry[1]
+ if grp_num < 0:
+ raise_from_rc(grp_num, None)
+ free_buffer(grp_name)
+
+ if grp_num > <int>ovec_count:
+ raise ValueError("Group referenced out of bounds")
+ start = ovec_table[2 * grp_num]
+
+ # Convert to code unit index as necessary.
+ if PyUnicode_Check(self._subj.obj):
+ _, start = codeunit_to_codepoint(self._subj, start, 0, 0)
+
+ return start
+
+
+ def end(self, group=0):
+ """ Get the ending index of the matched substring, or of a specified
+ captured group.
+ """
+ ovec_count = pcre2_get_ovector_count(self._mtch)
+ ovec_table = pcre2_get_ovector_pointer(self._mtch)
+
+ cdef int grp_num
+ cdef pcre2_sptr_t first_entry
+ cdef pcre2_sptr_t last_entry
+ if isinstance(group, int):
+ grp_num = group
+ else:
+ grp_name = get_buffer(group)
+ pcre2_substring_nametable_scan(
+ self._pattern._code, <pcre2_sptr_t>grp_name.buf, &first_entry, &last_entry
+ )
+ grp_num = (first_entry[0] << 8) | first_entry[1]
+ if grp_num < 0:
+ raise_from_rc(grp_num, None)
+ free_buffer(grp_name)
+
+ if grp_num > <int>ovec_count:
+ raise ValueError("Group referenced out of bounds.")
+ end = ovec_table[2 * grp_num + 1]
+
+ # Convert to code unit index as necessary.
+ if PyUnicode_Check(self._subj.obj):
+ _, end = codeunit_to_codepoint(self._subj, end, 0, 0)
+
+ return end
+
+
+ def substring(self, group=0, default=""):
+ """ Get the full matched substring, or that of a specified captured
+ group.
+ """
+ cdef uint8_t *res
+ cdef size_t res_len
+ if isinstance(group, int):
+ grp_num = <uint32_t>group
+
+ # Handle unset matches and return default if none found
+ is_substring_set = pcre2_substring_length_bynumber(self._mtch, grp_num, NULL)
+ if is_substring_set < 0:
+ return default
+
+ get_rc = pcre2_substring_get_bynumber(self._mtch, grp_num, &res, &res_len)
+ if get_rc < 0:
+ raise_from_rc(get_rc, None)
+ else:
+ grp_name = get_buffer(group)
+
+ # Handle unset matches and return default if none found
+ is_substring_set = pcre2_substring_length_byname(
+ self._mtch, <pcre2_sptr_t>grp_name.buf, NULL
+ )
+ if is_substring_set < 0:
+ return default
+
+ get_rc = pcre2_substring_get_byname(
+ self._mtch, <pcre2_sptr_t>grp_name.buf, &res, &res_len
+ )
+ if get_rc < 0:
+ raise_from_rc(get_rc, None)
+ free_buffer(grp_name)
+
+ # Clean up result and convert to unicode as appropriate.
+ result = (<pcre2_sptr_t>res)[:res_len]
+ result = result.strip(b"\x00")
+ if PyUnicode_Check(self._subj.obj):
+ result = result.decode("utf-8")
+
+ return result
+
+
+ def __getitem__(self, group):
+ """ Alias to substring.
+ """
+ return self.substring(group)
+
+
+ def expand(self, replacement, offset=0, options=0, low_memory=False):
+ """ Equivalent to calling substitute with the provided match. The type
+ of the subject determines the type of the returned string.
+ """
+ is_subj_utf = <bint>PyUnicode_Check(self._subj.obj)
+ is_repl_utf = <bint>PyUnicode_Check(replacement)
+ if is_subj_utf ^ is_repl_utf:
+ subj_type = "string" if is_subj_utf else "bytes-like"
+ repl_type = "string" if is_repl_utf else "bytes-like"
+ raise ValueError(f"Cannot use a {subj_type} subject with a {repl_type} replacement")
+
+ # Convert Python objects to C strings.
+ repl = get_buffer(replacement)
+ cdef size_t obj_ofst = <size_t>offset
+ cdef size_t ofst = obj_ofst
+ cdef uint32_t opts = <uint32_t>options | PCRE2_SUBSTITUTE_MATCHED
+ if is_subj_utf:
+ ofst, obj_ofst = codepoint_to_codeunit(self._subj, obj_ofst, 0, 0)
+
+ cdef size_t res_buf_len = 0
+ if not low_memory:
+ res_buf_len = self._subj.len + (self._subj.len // 2)
+
+ cdef int rc = 0
+ res, res_len = Pattern._substitute(
+ self._pattern._code, repl, self._subj, res_buf_len, ofst, opts, self._mtch, &rc
+ )
+ if res is NULL:
+ raise_from_rc(rc, None)
+
+ # Clean up result and convert to unicode as appropriate.
+ result = (<pcre2_sptr_t>res)[:res_len]
+ result = result.strip(b"\x00")
+ if is_subj_utf:
+ result = result.decode("utf-8")
+
+ free(res)
+ free_buffer(repl)
+ return result
+
+ def groups(self, default=""):
+ """ Return a tuple containing all the subgroups of the match, from 1 up to however many
+ groups are in the pattern.
+ """
+ return tuple(self.substring(g, default=default) for g in range(self.pattern.capture_count))
--- /dev/null
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdint cimport uint32_t
+from cpython cimport Py_buffer
+from cpython.unicode cimport PyUnicode_Check
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+from .match cimport Match
+
+
+def compile(pattern, options=0, jit=False):
+ """ Factory function to compile regular expressions into Pattern objects.
+ See the following PCRE2 documentation for a brief overview of the relevant
+ options:
+ http://pcre.org/current/doc/html/pcre2_compile.html
+ """
+
+ cdef Py_buffer *patn = get_buffer(pattern)
+ cdef uint32_t opts = <uint32_t>options
+
+ # Ensure unicode strings are processed with UTF-8 support.
+ if PyUnicode_Check(pattern):
+ opts = opts | PCRE2_UTF
+
+ cdef int compile_rc
+ cdef size_t compile_errpos
+ cdef pcre2_code_t *code = pcre2_compile(
+ <pcre2_sptr_t>patn.buf, <size_t>patn.len, opts, &compile_rc, &compile_errpos, NULL
+ )
+
+ if code is NULL:
+ # If source was a unicode string, use the code point offset.
+ if PyUnicode_Check(pattern):
+ _, compile_errpos = codeunit_to_codepoint(patn, compile_errpos, 0, 0)
+ additional_msg = f"Compilation failed at position {compile_errpos!r}"
+ raise_from_rc(compile_rc, additional_msg)
+
+ pattern_obj = Pattern._from_data(code, patn, opts)
+ if jit:
+ pattern_obj.jit_compile()
+ return pattern_obj
+
+
+def findall(pattern, subject, offset=0, options=0, jit=True):
+ """ Shorthand for compiling a pattern, then calling findall. Note that this
+ will use JIT compilation.
+ """
+ return compile(pattern, options=options, jit=jit).findall(subject, offset=offset)
+
+
+def match(pattern, subject, offset=0, options=0, jit=False):
+ """ Shorthand for compiling a pattern, then calling match.
+ """
+ return compile(pattern, options=options, jit=jit).match(subject, offset=offset)
+
+
+def scan(pattern, subject, offset=0, options=0, jit=True):
+ """ Shorthand for compiling a pattern, then calling scan. Note that this
+ will use JIT compilation.
+ """
+ return compile(pattern, options=options, jit=jit).scan(subject, offset=offset)
+
+
+def split(pattern, subject, maxsplit=0, offset=0, options=0, jit=True):
+ """ Shorthand for compiling a pattern, then calling split. Note that this
+ will use JIT compilation.
+ """
+ pattern_obj = compile(pattern, options=options, jit=jit)
+ return pattern_obj.split(subject, maxsplit=maxsplit, offset=offset)
+
+
+def substitute(
+ pattern,
+ replacement,
+ subject,
+ offset=0,
+ suball=True,
+ literal=False,
+ low_memory=False,
+ options=0,
+ jit=True
+):
+ """ Shorthand for compiling a pattern, then calling substitute.
+ """
+ pattern_obj = compile(pattern, options=options, jit=jit)
+ if suball:
+ pattern_obj.jit_compile()
+ return pattern_obj.substitute(
+ replacement, subject, offset=offset, suball=suball, literal=literal, low_memory=low_memory
+ )
--- /dev/null
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+from libc.stdint cimport uint32_t
+
+# Local imports.
+from .libpcre2 cimport *
+
+
+cdef class Pattern:
+ cdef pcre2_code_t *_code
+ cdef Py_buffer *_patn
+ cdef uint32_t _opts
+ cdef bint _jitc
+
+ @staticmethod
+ cdef Pattern _from_data(
+ pcre2_code_t *code, Py_buffer *patn, uint32_t opts
+ )
+
+ @staticmethod
+ cdef uint32_t _info_uint(pcre2_code_t *code, uint32_t what) except *
+ @staticmethod
+ cdef size_t _info_size(pcre2_code_t *code, uint32_t what) except *
+ @staticmethod
+ cdef bint _info_bint(pcre2_code_t *code, uint32_t what) except *
+
+ @staticmethod
+ cdef pcre2_match_data_t * _match(
+ pcre2_code_t *code, Py_buffer *subj, size_t ofst, uint32_t opts, int *rc
+ )
+
+ @staticmethod
+ cdef (uint8_t *, size_t) _substitute(
+ pcre2_code_t *code, Py_buffer *repl, Py_buffer *subj, size_t res_buf_len,
+ size_t ofst, uint32_t opts, pcre2_match_data_t *mtch, int *rc
+ )
--- /dev/null
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdint cimport uint32_t
+from libc.stdlib cimport malloc, free
+from cpython cimport Py_buffer
+from cpython cimport array
+from cpython.unicode cimport PyUnicode_Check
+from cpython.memoryview cimport PyMemoryView_FromMemory
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .match cimport Match
+from .scanner cimport Scanner
+from .consts import BsrChar, NewlineChar
+
+
+def _rebuild(pattern, code_bytes_obj, options):
+ """ Deserializes code object to allow for unpickling.
+ """
+ patn = get_buffer(pattern)
+ opts = <uint32_t>options
+ code_buf = get_buffer(code_bytes_obj)
+
+ cdef pcre2_code_t *code
+ number_of_codes = pcre2_serialize_decode(&code, 1, <const uint8_t *>code_buf.buf, NULL)
+ if number_of_codes < 0:
+ raise_from_rc(number_of_codes, None)
+
+ return Pattern._from_data(code, patn, opts)
+
+
+cdef class Pattern:
+ """
+ Object wrapper for a compiled pattern (known as a code struct) in PCRE2.
+ Attributes defined in pattern.pxd, see below for an overview:
+ _code: Raw compiled pattern, managed by PCRE2
+ _patn: Python object passed to compile
+ _opts: Option bits passed to compile call
+ _jitc: Indicator if pattern was JIT compiled
+ """
+
+ # =================================== #
+ # Lifetime management #
+ # =================================== #
+
+ def __cinit__(self):
+ self._code = NULL
+ self._patn = NULL
+ self._opts = 0
+ self._jitc = False
+
+
+ def __init__(self, *args, **kwargs):
+ # Prevent accidental instantiation from normal Python code since we
+ # cannot pass pointers into a Python constructor.
+ module = self.__class__.__module__
+ qualname = self.__class__.__qualname__
+ raise TypeError(f"Cannot create '{module}.{qualname}' instances")
+
+
+ def __dealloc__(self):
+ if self._patn is not NULL:
+ free_buffer(self._patn)
+ if self._code is not NULL:
+ pcre2_code_free(self._code)
+
+
+ @staticmethod
+ cdef Pattern _from_data(pcre2_code_t *code, Py_buffer *patn, uint32_t opts):
+ """ Factory function to create Pattern objects from C-type fields. The
+ ownership of the given pointers are stolen, which causes the extension
+ type to free them when the object is deallocated.
+ """
+ # Fast call to __new__() that bypasses the __init__() constructor.
+ cdef Pattern pattern = Pattern.__new__(Pattern)
+ pattern._code = code
+ pattern._patn = patn
+ pattern._opts = opts
+ return pattern
+
+
+ # ========================================= #
+ # Serialize and deserialize #
+ # ========================================= #
+
+ def __reduce__(self):
+ """ Serializes code object to allow for pickling.
+ """
+ cdef uint8_t *code_bytes
+ cdef size_t code_count
+ serialize_rc = pcre2_serialize_encode(
+ <const pcre2_code_t **>&self._code, 1, &code_bytes, &code_count, NULL
+ )
+ if serialize_rc < 0:
+ raise_from_rc(serialize_rc, None)
+
+ return (_rebuild, (self._patn.obj, code_bytes[:code_count], self._opts))
+
+
+ # =================================== #
+ # Pattern information #
+ # =================================== #
+
+ @staticmethod
+ cdef uint32_t _info_uint(pcre2_code_t *code, uint32_t what) except *:
+ """ Safely access pattern info returned as uint32_t.
+ """
+ cdef uint32_t where
+ pattern_info_rc = pcre2_pattern_info(code, what, &where)
+ if pattern_info_rc < 0:
+ raise_from_rc(pattern_info_rc, None)
+ return where
+
+ @staticmethod
+ cdef size_t _info_size(pcre2_code_t *code, uint32_t what) except *:
+ """ Safely access pattern info returned as size_t.
+ """
+ cdef size_t where
+ pattern_info_rc = pcre2_pattern_info(code, what, &where)
+ if pattern_info_rc < 0:
+ raise_from_rc(pattern_info_rc, None)
+ return where
+
+ @staticmethod
+ cdef bint _info_bint(pcre2_code_t *code, uint32_t what) except *:
+ """ Safely access pattern info returned as bint.
+ """
+ cdef bint where
+ pattern_info_rc = pcre2_pattern_info(code, what, &where)
+ if pattern_info_rc < 0:
+ raise_from_rc(pattern_info_rc, None)
+ return where
+
+
+ @property
+ def pattern(self):
+ """ Return the pattern the object was compiled with.
+ """
+ return self._patn.obj
+
+
+ @property
+ def options(self):
+ """ Returns the compile options as modified by any top-level (*XXX)
+ option settings such as (*UTF) at the start of the pattern itself.
+ """
+ return Pattern._info_uint(self._code, PCRE2_INFO_ALLOPTIONS)
+
+
+ @property
+ def backslash_r(self):
+ """ Return an indicator to what character sequences the \R escape
+ sequence matches.
+ """
+ bsr = Pattern._info_uint(self._code, PCRE2_INFO_BSR)
+ return BsrChar(bsr)
+
+
+ @property
+ def capture_count(self):
+ """ Returns the highest capture group number in the pattern. In
+ patterns where `(?|` is not used, this is also the total number of
+ capture groups.
+ """
+ return Pattern._info_uint(self._code, PCRE2_INFO_CAPTURECOUNT)
+
+
+ @property
+ def jit_size(self):
+ """ If the compiled pattern was successfully JIT compiled, return the
+ size of the JIT compiled code, otherwise return zero.
+ """
+ return Pattern._info_size(self._code, PCRE2_INFO_JITSIZE)
+
+ @property
+ def min_length(self):
+ """ Returns the minimum number of characters of matching subject strings.
+ """
+ return Pattern._info_uint(self._code, PCRE2_INFO_MINLENGTH)
+
+
+ @property
+ def name_count(self):
+ """ Returns the number of named capture groups.
+ """
+ return Pattern._info_uint(self._code, PCRE2_INFO_NAMECOUNT)
+
+
+ @property
+ def newline(self):
+ """ Returns the type of character sequence that will be recognized as
+ a newline while matching.
+ """
+ newline = Pattern._info_uint(self._code, PCRE2_INFO_NEWLINE)
+ return NewlineChar(newline)
+
+
+ @property
+ def size(self):
+ """ Returns the size of the compiled pattern in bytes.
+ """
+ return Pattern._info_size(self._code, PCRE2_INFO_SIZE)
+
+
+ def name_dict(self):
+ """ Returns a mapping from capture group number to capture group name.
+ """
+ # Get name table related information.
+ name_count = Pattern._info_uint(self._code, PCRE2_INFO_NAMECOUNT)
+ name_entry_size = Pattern._info_uint(self._code, PCRE2_INFO_NAMEENTRYSIZE)
+
+ cdef pcre2_sptr_t name_table
+ pattern_info_rc = pcre2_pattern_info(self._code, PCRE2_INFO_NAMETABLE, &name_table)
+ if pattern_info_rc < 0:
+ raise_from_rc(pattern_info_rc, None)
+
+ # Convert byte table to dictionary.
+ name_dict = {}
+ cdef uint32_t i
+ for i in range(name_count):
+ offset = i * name_entry_size
+
+ # First two bytes of name table contain index, followed by possibly
+ # unicode byte string.
+ entry_idx = int((name_table[offset] << 8) | name_table[offset + 1])
+ entry_name = name_table[offset + 2:offset + name_entry_size]
+
+ # Clean up entry and convert to unicode as appropriate.
+ entry_name = entry_name.strip(b"\x00")
+ if PyUnicode_Check(self._patn.obj):
+ entry_name = entry_name.decode("utf-8")
+
+ name_dict[entry_idx] = entry_name
+
+ return name_dict
+
+
+ # ======================= #
+ # Methods #
+ # ======================= #
+
+ def jit_compile(self):
+ """ JIT compile the pattern.
+ """
+ jit_compile_rc = pcre2_jit_compile(self._code, PCRE2_JIT_COMPLETE)
+ if jit_compile_rc < 0:
+ raise_from_rc(jit_compile_rc, None)
+ self._jitc = True
+
+
+ @staticmethod
+ cdef pcre2_match_data_t * _match(
+ pcre2_code_t *code,
+ Py_buffer *subj,
+ size_t ofst,
+ uint32_t opts,
+ int *rc):
+ """ Safe wrapper around calling PCRE2 function directly.
+ """
+ # Allocate memory for match.
+ mtch = pcre2_match_data_create_from_pattern(code, NULL)
+ if mtch is NULL:
+ rc[0] = PCRE2_ERROR_NOMEMORY
+ return NULL
+
+ # Attempt match of pattern onto subject.
+ rc[0] = pcre2_match(
+ code, <pcre2_sptr_t>subj.buf, <size_t>subj.len,
+ ofst, opts, mtch, NULL
+ )
+ return mtch
+
+
+ def findall(self, subject, offset=0):
+ """
+ Return all non-overlapping matches of our pattern in subject, as a list of strings or tuples.
+
+ The string is scanned left-to-right, and matches are returned in the
+ order found. Empty matches are included in the result.
+
+ The result depends on the number of capturing groups in the pattern.
+ If there are no groups, return a list of strings matching the whole
+ pattern. If there is exactly one group, return a list of strings
+ matching that group. If multiple groups are present, return a list of
+ tuples of strings matching the groups. Non-capturing groups do not
+ affect the form of the result.
+ """
+ matches = self.scan(subject, offset=offset)
+ if self.capture_count == 0:
+ return [m.substring() for m in matches]
+ elif self.capture_count == 1:
+ return [m.substring(1) for m in matches]
+ result = []
+ for m in matches:
+ result.append(tuple(m.substring(g) for g in range(self.capture_count)))
+ return result
+
+
+ def match(self, subject, offset=0):
+ """ If match exists, returns the corresponding Match object. Otherwise
+ a MatchError is thrown in the case of no matches. See the following
+ PCRE2 documentation for a brief overview of the relevant options:
+ http://pcre.org/current/doc/html/pcre2_match.html
+ """
+ cdef bint is_patn_utf = PyUnicode_Check(self._patn.obj)
+ cdef bint is_subj_utf = PyUnicode_Check(subject)
+ if is_patn_utf ^ is_subj_utf:
+ patn_type = "string" if is_patn_utf else "bytes-like"
+ subj_type = "string" if is_subj_utf else "bytes-like"
+ raise ValueError(f"Cannot use a {patn_type} pattern with a {subj_type} subject")
+
+ cdef Py_buffer *subj = get_buffer(subject)
+ cdef size_t obj_ofst = <size_t>offset
+ cdef size_t ofst = obj_ofst
+ cdef uint32_t opts = 0
+
+ # Convert indices accordingly.
+ if is_subj_utf:
+ ofst, obj_ofst = codepoint_to_codeunit(subj, obj_ofst, 0, 0)
+
+ cdef int match_rc = 0
+ mtch = Pattern._match(self._code, subj, ofst, opts, &match_rc)
+ if match_rc < 0:
+ raise_from_rc(match_rc, None)
+
+ return Match._from_data(mtch, self, subj, ofst, opts)
+
+
+ def scan(self, subject, offset=0):
+ """ Returns iterator over all non-overlapping matches in a subject,
+ yielding Match objects.
+ """
+ cdef bint is_patn_utf = PyUnicode_Check(self._patn.obj)
+ cdef bint is_subj_utf = PyUnicode_Check(subject)
+ if is_patn_utf ^ is_subj_utf:
+ patn_type = "string" if is_patn_utf else "bytes-like"
+ subj_type = "string" if is_subj_utf else "bytes-like"
+ raise ValueError(f"Cannot use a {patn_type} pattern with a {subj_type} subject")
+
+ subj = get_buffer(subject)
+ return Scanner._from_data(self, subj, offset)
+
+
+ def split(self, subject, maxsplit=0, offset=0):
+ """
+ Split subject by occurances of our pattern.
+
+ If capturing parentheses are used in pattern, then the text of all
+ groups in the pattern are also returned as part of the resulting list.
+ If maxsplit is nonzero, at most maxsplit splits occur, and the
+ remainder of the string is returned as the final element of the list.
+
+ If there are capturing groups in the separator and it matches at the
+ start of the string, the result will start with an empty string. The
+ same holds for the end of the string.
+
+ That way, separator components are always found at the same relative
+ indices within the result list.
+
+ Empty matches for the pattern split the string only when not adjacent
+ to a previous empty match.
+ """
+ output = []
+ pos = n = 0
+ for match in self.scan(subject, offset=offset):
+ start = match.start()
+ end = match.end()
+ if start != end:
+ output.append(subject[pos:start])
+ output.extend(match.groups())
+ pos = end
+ n += 1
+ if 0 < maxsplit <= n:
+ break
+ output.append(subject[pos:])
+ return output
+
+
+ @staticmethod
+ cdef (uint8_t *, size_t) _substitute(
+ pcre2_code_t *code,
+ Py_buffer *repl,
+ Py_buffer *subj,
+ size_t res_buf_len,
+ size_t ofst,
+ uint32_t opts,
+ pcre2_match_data_t *mtch,
+ int *rc):
+ """ Safe wrapper around calling PCRE2 function directly.
+ """
+ cdef size_t res_len = res_buf_len
+ cdef uint8_t *res
+ res = <uint8_t *>malloc(res_len * sizeof(uint8_t))
+ substitute_rc = pcre2_substitute(
+ code,
+ <pcre2_sptr_t>subj.buf, <size_t>subj.len,
+ ofst, opts | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, mtch, NULL,
+ <pcre2_sptr_t>repl.buf, <size_t>repl.len,
+ res, &res_len
+ )
+ # Reattempt substitution, now with required size of buffer known.
+ if substitute_rc == PCRE2_ERROR_NOMEMORY:
+ free(res)
+ res = <uint8_t *>malloc(res_len * sizeof(uint8_t))
+ substitute_rc = pcre2_substitute(
+ code,
+ <pcre2_sptr_t>subj.buf, <size_t>subj.len,
+ ofst, opts, mtch, NULL,
+ <pcre2_sptr_t>repl.buf, <size_t>repl.len,
+ res, &res_len
+ )
+ # Capture return codes from both substitute attempts.
+ if substitute_rc < 0:
+ free(res)
+ free_buffer(subj)
+ free_buffer(repl)
+ rc[0] = substitute_rc
+ return NULL, 0
+
+ return res, res_len
+
+
+ def substitute(
+ self, replacement, subject, offset=0, suball=True, literal=False, low_memory=False
+ ):
+ """ Returns the string obtained by replaces matches in subject with a
+ replacement. Note that option bits can significantly change the
+ functions behavior. See the following PCRE2 documentation for a brief
+ overview of the relevant options:
+ http://pcre.org/current/doc/html/pcre2_substitute.html
+ """
+ is_patn_utf = <bint>PyUnicode_Check(self._patn.obj)
+ is_subj_utf = <bint>PyUnicode_Check(subject)
+ is_repl_utf = <bint>PyUnicode_Check(replacement)
+ if is_subj_utf ^ is_repl_utf:
+ subj_type = "string" if is_subj_utf else "bytes-like"
+ repl_type = "string" if is_repl_utf else "bytes-like"
+ raise ValueError(f"Cannot use a {subj_type} subject with a {repl_type} replacement")
+ if is_patn_utf ^ is_subj_utf:
+ patn_type = "string" if is_patn_utf else "bytes-like"
+ subj_type = "string" if is_subj_utf else "bytes-like"
+ raise ValueError(f"Cannot use a {patn_type} pattern with a {subj_type} subject")
+
+ # Convert Python objects to C types.
+ subj = get_buffer(subject)
+ repl = get_buffer(replacement)
+ cdef size_t obj_ofst = <size_t>offset
+ cdef size_t ofst = obj_ofst
+
+ # Fill options from flags
+ cdef uint32_t opts = 0
+ if suball:
+ opts = opts | PCRE2_SUBSTITUTE_GLOBAL
+ if literal:
+ opts = opts | PCRE2_SUBSTITUTE_LITERAL
+
+ # Always replace unmatched groups with an empty string to match behavior of re
+ opts = opts | PCRE2_SUBSTITUTE_UNSET_EMPTY
+
+ if is_subj_utf:
+ ofst, obj_ofst = codepoint_to_codeunit(subj, obj_ofst, 0, 0)
+
+ cdef size_t res_buf_len = 0
+ if not low_memory:
+ res_buf_len = subj.len + (subj.len // 2)
+
+ cdef int rc = 0
+ res, res_len = Pattern._substitute(
+ self._code, repl, subj, res_buf_len, ofst, opts, NULL, &rc
+ )
+ if res is NULL:
+ raise_from_rc(rc, None)
+
+ # Clean up result and convert to unicode as appropriate.
+ result = (<pcre2_sptr_t>res)[:res_len]
+ result = result.strip(b"\x00")
+ if is_subj_utf:
+ result = result.decode("utf-8")
+
+ free(res)
+ free_buffer(subj)
+ free_buffer(repl)
+ return result
--- /dev/null
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+from libc.stdint cimport uint32_t
+
+# Local imports.
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+
+
+cdef class Scanner:
+ cdef Pattern _pattern
+ cdef Py_buffer *_subj
+
+ cdef bint _is_crlf_newline
+ cdef bint _is_patn_utf
+
+ cdef uint32_t _state_opts
+ cdef size_t _state_ofst
+ cdef size_t _state_obj_ofst
+
+ @staticmethod
+ cdef Scanner _from_data(
+ Pattern pattern, Py_buffer *subject, size_t offset
+ )
--- /dev/null
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdint cimport uint32_t
+from libc.stdlib cimport malloc, free
+from cpython cimport Py_buffer
+from cpython cimport array
+from cpython.unicode cimport PyUnicode_Check
+from cpython.memoryview cimport PyMemoryView_FromMemory
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .match cimport Match
+from .pattern cimport Pattern
+from .consts import BsrChar, NewlineChar
+
+
+cdef class Scanner:
+ """ Iterator object that scans a subject all non-overlapping matches of a
+ pattern. Attributes defined in scanner.pxd, see below for an overview:
+ _pattern: Pattern object to use for matching
+ _subj: Subject to scan
+ _is_crlf_newline: Whether the character sequence CRLF denotes a newline
+ _is_patn_utf: Whether the pattern was compiled with UTF support
+ _state_opts: Options to pass to match
+ _state_ofst: Byte offset to match at
+ _state_obj_ofst: Object offset to match at
+ """
+
+
+ # =================================== #
+ # Lifetime management #
+ # =================================== #
+
+ def __cinit__(self):
+ self._pattern = None
+ self._subj = NULL
+
+ self._is_patn_utf = False
+ self._is_crlf_newline = False
+
+ self._state_opts = 0
+ self._state_ofst = 0
+ self._state_obj_ofst = 0
+
+
+ def __init__(self, *args, **kwargs):
+ # Prevent accidental instantiation from normal Python code since we
+ # cannot pass pointers into a Python constructor.
+ module = self.__class__.__module__
+ qualname = self.__class__.__qualname__
+ raise TypeError(f"Cannot create '{module}.{qualname}' instances")
+
+
+ def __dealloc__(self):
+ if self._subj is not NULL:
+ free_buffer(self._subj)
+
+
+ @staticmethod
+ cdef Scanner _from_data(Pattern pattern, Py_buffer *subj, size_t offset):
+ """ Factory function to create Scanner objects from C-type fields. The
+ ownership of the given pointers are stolen, which causes the extension
+ type to free them when the object is deallocated.
+ """
+ # Fast call to __new__() that bypasses the __init__() constructor.
+ cdef Scanner scanner = Scanner.__new__(Scanner)
+ scanner._pattern = pattern
+ scanner._subj = subj
+
+ patn_opts = Pattern._info_uint(pattern._code, PCRE2_INFO_ALLOPTIONS)
+ scanner._is_patn_utf = (patn_opts & PCRE2_UTF) != 0
+ newline = Pattern._info_uint(pattern._code, PCRE2_INFO_NEWLINE)
+ scanner._is_crlf_newline = (
+ newline == PCRE2_NEWLINE_ANY or
+ newline == PCRE2_NEWLINE_CRLF or
+ newline == PCRE2_NEWLINE_ANYCRLF
+ )
+ scanner._state_opts = 0
+
+ # Compute and set byte equivalent offset.
+ if scanner._is_patn_utf:
+ ofst, obj_ofst = codepoint_to_codeunit(scanner._subj, offset, 0, 0)
+ scanner._state_ofst = ofst
+ scanner._state_obj_ofst = obj_ofst
+ else:
+ scanner._state_obj_ofst = offset
+ scanner._state_ofst = scanner._state_obj_ofst
+ return scanner
+
+
+ # ======================================== #
+ # Iteration implementation #
+ # ======================================== #
+
+ def __iter__(self):
+ return self
+
+
+ def __next__(self):
+ """ Yields next match object found in subject.
+ """
+ if self._state_obj_ofst > self._subj.len:
+ raise StopIteration
+
+ # Attempt match of pattern onto subject.
+ match_rc = <int>0
+ mtch = Pattern._match(
+ self._pattern._code, self._subj, self._state_ofst, self._state_opts, &match_rc
+ )
+
+ # Handle no matches in result.
+ if match_rc == PCRE2_ERROR_NOMATCH:
+ # Default match is not achored so if no match found at current offset, then there
+ # will not be any ahead either.
+ if self._state_opts == 0:
+ pcre2_match_data_free(mtch)
+ raise StopIteration
+
+ # Reset options so empty strings can match at next offset.
+ self._state_opts = 0
+
+ # Increment to next character and handle possible CRLF newlines.
+ obj_ofst_increment = 1
+ if self._is_crlf_newline and (self._state_ofst + 1) < self._subj.len:
+ if (<bytes>self._subj.buf)[self._state_ofst:self._state_ofst + 2] == b"\r\n":
+ obj_ofst_increment += 1
+
+ # Convert indices accordingly.
+ if self._is_patn_utf:
+ self._state_ofst, self._state_obj_ofst = codepoint_to_codeunit(
+ self._subj,
+ self._state_obj_ofst + obj_ofst_increment,
+ self._state_ofst,
+ self._state_obj_ofst
+ )
+ else:
+ self._state_obj_ofst = self._state_obj_ofst + obj_ofst_increment
+ self._state_ofst = self._state_obj_ofst
+
+ pcre2_match_data_free(mtch)
+ return self.__next__()
+
+ # Handle all other errors.
+ elif mtch is NULL or match_rc < 0:
+ pcre2_match_data_free(mtch)
+ raise_from_rc(match_rc, None)
+
+ # If the match was successful.
+ else:
+ ovec_table = pcre2_get_ovector_pointer(mtch)
+ mtch_end = ovec_table[1]
+
+ if self._state_ofst == mtch_end:
+ # If the matched string is empty ensure next is not.
+ self._state_opts = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED
+ else:
+ # Convert the end in the byte string to the end in the object.
+ self._state_opts = 0
+ if self._is_patn_utf:
+ self._state_ofst, self._state_obj_ofst = codeunit_to_codepoint(
+ self._subj, mtch_end, self._state_ofst, self._state_obj_ofst
+ )
+ else:
+ self._state_obj_ofst = mtch_end
+ self._state_ofst = self._state_obj_ofst
+
+ # Create new buffer for match object to own
+ subj_copy = get_buffer(self._subj.obj)
+ return Match._from_data(
+ mtch, self._pattern, subj_copy, self._state_ofst, self._state_opts
+ )
--- /dev/null
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+
+
+cdef int free_buffer(Py_buffer *pybuf)
+
+cdef Py_buffer * get_buffer(object obj) except NULL
+
+cdef (size_t, size_t) codeunit_to_codepoint(
+ Py_buffer *pybuf,
+ size_t codeunit_idx,
+ size_t cur_codeunit_idx, size_t cur_codepoint_idx
+)
+cdef (size_t, size_t) codepoint_to_codeunit(
+ Py_buffer *pybuf,
+ size_t codepoint_idx,
+ size_t cur_codeunit_idx, size_t cur_codepoint_idx
+)
+
+cdef void * raise_from_rc(int errorcode, object context_msg) except NULL
--- /dev/null
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdlib cimport malloc, free
+from libc.stdint cimport uint8_t
+from cpython cimport Py_buffer
+from cpython.buffer cimport (
+ PyObject_CheckBuffer,
+ PyBuffer_IsContiguous,
+ PyObject_GetBuffer,
+ PyBuffer_FillInfo,
+ PyBuffer_Release
+)
+from cpython.unicode cimport (
+ PyUnicode_Check
+)
+cdef extern from "Python.h":
+ int PyUnicode_1BYTE_KIND
+ int PyUnicode_2BYTE_KIND
+ int PyUnicode_4BYTE_KIND
+ unsigned int PyUnicode_KIND(object o)
+ void *PyUnicode_DATA(object o)
+ const char * PyUnicode_AsUTF8AndSize(object unicode, Py_ssize_t *size)
+
+# Local imports.
+from .libpcre2 cimport *
+from .exceptions import LibraryError, CompileError, MatchError
+
+
+cdef int free_buffer(Py_buffer *pybuf):
+ """ Safe free a buffer pointer, ensuring it first releases it's reference
+ """
+ if pybuf is not NULL:
+ PyBuffer_Release(pybuf)
+ free(pybuf)
+ return 0
+
+
+cdef Py_buffer * get_buffer(object obj) except NULL:
+ """ Get a Python buffer from an object, encoding via UTF-8 if unicode
+ based
+ """
+ cdef const char *sptr = NULL
+ cdef Py_ssize_t length = 0
+
+ pybuf = <Py_buffer *>malloc(sizeof(Py_buffer))
+ if not pybuf:
+ raise MemoryError()
+
+ # Process unicode and derivative objects.
+ if PyUnicode_Check(obj):
+ sptr = PyUnicode_AsUTF8AndSize(obj, &length)
+ fill_buf_rc = PyBuffer_FillInfo(pybuf, obj, <void *>sptr, length, 1, 0)
+ if fill_buf_rc < 0:
+ free_buffer(pybuf)
+ raise ValueError("Could not fill internal buffer")
+
+ # Handle all other bytes-like objects.
+ else:
+ if PyObject_CheckBuffer(obj):
+ get_buffer_rc = PyObject_GetBuffer(obj, pybuf, 0)
+ if not PyBuffer_IsContiguous(pybuf, b"A"):
+ free_buffer(pybuf)
+ raise ValueError("Bytes-like object must be contiguous")
+ else:
+ free(pybuf)
+ raise ValueError("Input must be string or bytes-like")
+
+ return pybuf
+
+
+cdef (size_t, size_t) codeunit_to_codepoint(
+ Py_buffer *pybuf,
+ size_t codeunit_idx,
+ size_t cur_codeunit_idx, size_t cur_codepoint_idx
+):
+ """ Convert a code unit index to a code point index
+ """
+ while cur_codeunit_idx < codeunit_idx:
+ if (((<uint8_t *>pybuf.buf)[cur_codeunit_idx]) & 0xC0) != 0x80:
+ cur_codepoint_idx += 1
+ cur_codeunit_idx += 1
+ return cur_codeunit_idx, cur_codepoint_idx
+
+
+cdef (size_t, size_t) codepoint_to_codeunit(
+ Py_buffer *pybuf,
+ size_t codepoint_idx,
+ size_t cur_codeunit_idx, size_t cur_codepoint_idx
+):
+ """
+ """
+ while cur_codepoint_idx < codepoint_idx:
+ cur_codeunit_idx += 1
+ if (((<uint8_t *>pybuf.buf)[cur_codeunit_idx]) & 0xC0) != 0x80:
+ cur_codepoint_idx += 1
+ return cur_codeunit_idx, cur_codepoint_idx
+
+
+cdef void * raise_from_rc(int errorcode, object context_msg) except NULL:
+ """ Raise the appropriate error type from the given error code
+
+ Raises one of the custom exception classes defined in this module. Each
+ exception corresponds to a set of error codes defined in PCRE2. Error
+ messages are retrieved from PCRE2.
+ """
+ # Match against error code classes.
+ if errorcode > 0:
+ raise CompileError(errorcode, context_msg)
+
+ elif errorcode == PCRE2_ERROR_NOMATCH or errorcode == PCRE2_ERROR_PARTIAL:
+ raise MatchError(errorcode, context_msg)
+
+ else:
+ raise LibraryError(errorcode, context_msg)
--- /dev/null
+import pytest
+import pcre2
+from pcre2.exceptions import CompileError, MatchError, LibraryError
+
+def test_match_groups():
+ assert pcre2.match('a', 'a').groups() == ()
+ assert pcre2.match('(a)', 'a').groups() == ('a',)
+
+ assert pcre2.match(b'a', b'a').groups() == ()
+ assert pcre2.match(b'(a)', b'a').groups() == (b'a',)
+
+ for a in ("\xe0", "\u0430", "\U0001d49c"):
+ assert pcre2.match(a, a).groups() == ()
+ assert pcre2.match('(%s)' % a, a).groups() == (a,)
--- /dev/null
+import pytest
+import pcre2
+from pcre2.exceptions import CompileError, MatchError, LibraryError
+
+
+# All tests should match successfully.
+test_data_match_bounds = [
+ (b".*", "aba•ba••ba•••b".encode(), 0, 0, 0, 0, 26),
+ (".*", "aba•ba••ba•••b", 0, 0, 0, 0, 14),
+]
+@pytest.mark.parametrize("pattern,subject,options,offset,group,start,end", test_data_match_bounds)
+def test_match_bounds(pattern, subject, options, offset, group, start, end):
+ p = pcre2.compile(pattern, options=options)
+ m = p.match(subject, offset=offset)
+ assert (m.start(group), m.end(group)) == (start, end)
+
+
+test_data_match_substring = [
+ (b".*", "aba•ba••ba•••b".encode(), 0, 0, "aba•ba••ba•••b".encode()),
+ (".*", "aba•ba••ba•••b", 0, 0, "aba•ba••ba•••b"),
+]
+@pytest.mark.parametrize("pattern,subject,options,offset,substring", test_data_match_substring)
+def test_match_substring(pattern, subject, options, offset, substring):
+ p = pcre2.compile(pattern, options=options)
+ m = p.match(subject, offset=offset)
+ assert m.substring() == substring
+
+
+test_data_match_expand = [
+ (b"[abc]*", b"", b"dabacbaccbacccb", 0, 0, b"dabacbaccbacccb"),
+ ("[abc]*", "", "dabacbaccbacccb", 0, 0, "dabacbaccbacccb"),
+ ("[abc]*", "", "dabacbaccbacccb", 0, 1, "d"),
+]
+@pytest.mark.parametrize(
+ "pattern,replacement,subject,options,offset,result", test_data_match_expand
+)
+def test_match_expand(pattern, replacement, subject, options, offset, result):
+ p = pcre2.compile(pattern, options=options)
+ m = p.match(subject, offset=offset)
+ assert m.expand(replacement) == result
\ No newline at end of file
--- /dev/null
+import pytest
+import pcre2
+from pcre2.exceptions import CompileError, MatchError, LibraryError
+from pcre2.consts import CompileOption
+
+
+test_data_pattern_compile_success = [
+ (b"a+b+c*d*", 0, "SUCCESS"),
+ (b"(?<foo>a+b+)c*d*", 0, "SUCCESS"),
+ (b"(?<foo>a+b+))c*d*", 0, "COMPILE_ERROR"),
+ ("å+∫+ç*∂*".encode(), 0, "SUCCESS"),
+ ("a+b+c*d*", 0, "SUCCESS"),
+ ("(?<foo>a+b+)c*d*", 0, "SUCCESS"),
+ ("(?<foo>a+b+))c*d*", 0, "COMPILE_ERROR"),
+ ("(?<<foo>a+b+)c*d*", 0, "COMPILE_ERROR"),
+ ("(?<foo>a+b+)c*d*(?<foo>a+b+)", 0, "COMPILE_ERROR"),
+ ("(?<foo>a+b+)c*d*(?<foo>a+b+)", pcre2.CompileOption.DUPNAMES, "SUCCESS"),
+ ("å+∫+ç*∂*", 0, "SUCCESS"),
+ ("(?<ƒøø>a+b+)c*d*", 0, "SUCCESS"),
+]
+@pytest.mark.parametrize("pattern,options,return_code", test_data_pattern_compile_success)
+def test_pattern_compile_success(pattern, options, return_code):
+ try:
+ p = pcre2.compile(pattern, options=options)
+ rc = "SUCCESS"
+ assert p.jit_size == 0
+ except CompileError as e:
+ rc = "COMPILE_ERROR"
+ except LibraryError as e:
+ rc = "LIB_ERROR"
+ assert rc == return_code
+
+@pytest.mark.parametrize("pattern,options,return_code", test_data_pattern_compile_success)
+def test_pattern_jit_compile_success(pattern, options, return_code):
+ try:
+ p = pcre2.compile(pattern, options=options, jit=True)
+ rc = "SUCCESS"
+ assert p.jit_size > 0
+ except CompileError as e:
+ rc = "COMPILE_ERROR"
+ except LibraryError as e:
+ rc = "LIB_ERROR"
+ assert rc == return_code
+
+
+test_data_pattern_name_dict = [
+ (b"(?<foo>a+b+)c*d*", 0, {1: b"foo"}),
+ ("(?<foo>a+b+)c*d*", 0, {1: "foo"}),
+ ("(?<ƒøø>a+b+)c*d*", 0, {1: "ƒøø"}),
+ ("(?<foo>a+b+)c*d*(?<bar>a+b+)", 0, {1: "foo", 2: "bar"}),
+ ("(?<foo>a+b+)c*(.+)d*(?<bar>a+b+)", 0, {1: "foo", 3: "bar"}),
+ ("(?<foo>a+b+)c*d*(?<foo>a+b+)", pcre2.CompileOption.DUPNAMES, {1: "foo", 2: "foo"}),
+]
+@pytest.mark.parametrize("pattern,options,name_dict", test_data_pattern_name_dict)
+def test_pattern_name_dict(pattern, options, name_dict):
+ p = pcre2.compile(pattern, options=options)
+ assert p.name_dict() == name_dict
+
+
+test_data_pattern_match_success = [
+ (b".*", b"abacbaccbacccb", 0, 0, "SUCCESS"),
+ (".*", "abacbaccbacccb", 0, 0, "SUCCESS"),
+ ("ac{3,}b", "abacbaccbacccb", 0, 0, "SUCCESS"),
+ ("a•{3,}b", "aba•ba••ba•••b", 0, 0, "SUCCESS"),
+ ("ab", "abacbaccbacccb", 0, 2, "MATCH_ERROR"),
+ ("((((((((((((((()))))))))))))))", "", 0, 0, "SUCCESS"),
+]
+@pytest.mark.parametrize(
+ "pattern,subject,options,offset,return_code", test_data_pattern_match_success
+)
+def test_pattern_match_success(pattern, subject, options, offset, return_code):
+ p = pcre2.compile(pattern, options=options)
+ try:
+ m = p.match(subject, offset=offset)
+ rc = "SUCCESS"
+ except MatchError as e:
+ rc = "MATCH_ERROR"
+ except LibraryError as e:
+ rc = "LIB_ERROR"
+ assert rc == return_code
+
+
+test_data_pattern_scan_length = [
+ (b".+", b"abacbaccbacccb", 0, 1),
+ (b".*", b"abacbaccbacccb", 0, 2),
+ (".+", "abacbaccbacccb", 0, 1),
+ (".*", "abacbaccbacccb", 0, 2),
+ ("[abc]*", "dabacbaccbacccb", 0, 3),
+ ("ac{2,}b", "abacbaccbacccb", 0, 2),
+ ("a•{2,}b", "aba•ba••ba•••b", 0, 2),
+ ("a•*b", "aba•ba••ba•••b", 0, 4),
+ ("ab", "abacbaccbacccb", 2, 0),
+]
+@pytest.mark.parametrize(
+ "pattern,subject,offset,iter_length", test_data_pattern_scan_length
+)
+def test_pattern_scan_length(pattern, subject, offset, iter_length):
+ p = pcre2.compile(pattern)
+ s = p.scan(subject, offset=offset)
+ assert len(list(iter(s))) == iter_length
+
+
+test_pattern_substitute = [
+ (b"[abc]*", b"", b"dabacbaccbacccb", False, False, 0, b"dabacbaccbacccb"),
+ ("[abc]*", "", "dabacbaccbacccb", False, False, 0, "dabacbaccbacccb"),
+ ("[abc]*", "", "dabacbaccbacccb", False, False, 1, "d"),
+ ("a(•{2,})b", "a•b", "aba•ba••ba•••b", True, False, 0, "aba•ba•ba•b"),
+ ("a(•{2,})b", "a$1b", "aba•ba••ba•••b", True, True, 0, "aba•ba$1ba$1b"),
+]
+@pytest.mark.parametrize(
+ "pattern,replacement,subject,suball,literal,offset,result", test_pattern_substitute
+)
+def test_pattern_substitute(pattern, replacement, subject, suball, literal, offset, result):
+ p = pcre2.compile(pattern)
+ assert p.substitute(replacement, subject, suball=suball, literal=literal, offset=offset) == result
+
+def test_pattern_findall():
+ p = pcre2.compile(r'(\w+)=(\d+)')
+ assert p.findall('set width=20 and height=10') == [('width=20', 'width'), ('height=10', 'height')]
+ s = bytes(range(128)).decode()
+ p2 = pcre2.compile(r'[0-9--1]')
+ assert p2.findall(s) == list('-./0123456789')
+ p3 = pcre2.compile(r'[%--1]')
+ assert p3.findall(s) == list("%&'()*+,-1")
+ p4 = pcre2.compile(r'[%--]')
+ assert p4.findall(s) == list("%&'()*+,-")
+ p5 = pcre2.compile(r'[0-9&&1]')
+ assert p5.findall(s) == list('&0123456789')
+ p6 = pcre2.compile(r'[\d&&1]')
+ assert p6.findall(s) == list('&0123456789')
+ p7 = pcre2.compile(r'[0-9||a]')
+ assert p7.findall(s) == list('0123456789a|')
+ p8 = pcre2.compile(r'[\d||a]')
+ assert p8.findall(s) == list('0123456789a|')
+ p9 = pcre2.compile(r'[0-9~~1]')
+ assert p9.findall(s) == list('0123456789~')
+ p10 = pcre2.compile(r'[\d~~1]')
+ assert p10.findall(s) == list('0123456789~')
+ p11 = pcre2.compile(r'[[0-9]|]')
+ assert p11.findall(s) == list('0123456789[]')
+
+ for reps in '*', '+', '?', '{1}':
+ for mod in '', '?':
+ pattern = '.' + reps + mod + 'yz'
+ assert pcre2.compile(pattern, pcre2.S).findall('xyz') == ['xyz'], pattern
+ pattern = pattern.encode()
+ assert pcre2.compile(pattern, pcre2.S).findall(b'xyz') == [b'xyz'], pattern
+
+
+def test_pattern_jit_findall():
+ assert pcre2.findall(r'(\w+)=(\d+)', 'set width=20 and height=10') == [('width=20', 'width'), ('height=10', 'height')]
+ assert pcre2.findall(":+", "abc") == []
+ assert pcre2.findall(":+", "a:b::c:::d") == [":", "::", ":::"]
+ assert pcre2.findall("(:+)", "a:b::c:::d") == [":", "::", ":::"]
+
+ for x in ("\xe0", "\u0430", "\U0001d49c"):
+ xx = x * 2
+ xxx = x * 3
+ string = "a%sb%sc%sd" % (x, xx, xxx)
+ assert pcre2.findall("%s+" % x, string) == [x, xx, xxx]
+ assert pcre2.findall("(%s+)" % x, string) == [x, xx, xxx]
+
+ assert len(pcre2.findall(r"\b", "a")) == 2
+ assert len(pcre2.findall(r"\B", "a")) == 0
+ assert len(pcre2.findall(r"\b", " ")) == 0
+ assert len(pcre2.findall(r"\b", " ")) == 0
+ assert len(pcre2.findall(r"\B", " ")) == 2
+
+ s = bytes(range(128)).decode()
+ assert pcre2.findall(r'[--1]', s) == list('-./01')
+ assert pcre2.findall(r'[&&1]', s) == list('&1')
+ assert pcre2.findall(r'[||1]', s) == list('1|')
+ assert pcre2.findall(r'[~~1]', s) == list('1~')
+
+ assert pcre2.findall(r"(?i)(a)\1", "aa \u0100") == ['a']
+
+ assert pcre2.findall(r'a++', 'aab') == ['aa']
+ assert pcre2.findall(r'a*+', 'aab') == ['aa', '', '']
+ assert pcre2.findall(r'a?+', 'aab') == ['a', 'a', '', '']
+ assert pcre2.findall(r'a{1,3}+', 'aab') == ['aa']
+
+ assert pcre2.findall(r'(?:ab)++', 'ababc') == ['abab']
+ assert pcre2.findall(r'(?:ab)*+', 'ababc') == ['abab', '', '']
+ assert pcre2.findall(r'(?:ab)?+', 'ababc') == ['ab', 'ab', '', '']
+ assert pcre2.findall(r'(?:ab){1,3}+', 'ababc') == ['abab']
+
+ assert pcre2.findall(r'(?>a+)', 'aab') == ['aa']
+ assert pcre2.findall(r'(?>a*)', 'aab') == ['aa', '', '']
+ assert pcre2.findall(r'(?>a?)', 'aab') == ['a', 'a', '', '']
+ assert pcre2.findall(r'(?>a{1,3})', 'aab') == ['aa']
+
+ assert pcre2.findall(r'(?>(?:ab)+)', 'ababc') == ['abab']
+ assert pcre2.findall(r'(?>(?:ab)*)', 'ababc') == ['abab', '', '']
+ assert pcre2.findall(r'(?>(?:ab)?)', 'ababc') == ['ab', 'ab', '', '']
+ assert pcre2.findall(r'(?>(?:ab){1,3})', 'ababc') == ['abab']
+
+ import re
+ b = 'y\u2620y\u2620y'.encode('utf-8')
+ assert len(pcre2.findall(re.escape('\u2620'.encode('utf-8')), b)) == 2
+
+
+def test_pattern_split():
+ pattern = "[\u002E\u3002\uFF0E\uFF61]"
+ assert pcre2.compile(pattern).split("a.b.c") == ['a','b','c']
+
+
+def test_pattern_jit_split():
+ assert pcre2.split(":", ":a:b::c") == ['', 'a', 'b', '', 'c']
+ assert pcre2.split(":+", ":a:b::c") == ['', 'a', 'b', 'c']
+ assert pcre2.split("(:+)", ":a:b::c") == ['', ':', 'a', ':', 'b', '::', 'c']
+
+ assert pcre2.split(b":", b":a:b::c") == [b'', b'a', b'b', b'', b'c']
+ assert pcre2.split(b":+", b":a:b::c") == [b'', b'a', b'b', b'c']
+ assert pcre2.split(b"(:+)", b":a:b::c") == [b'', b':', b'a', b':', b'b', b'::', b'c']
+
+ for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
+ "\U0001d49c\U0001d49e\U0001d4b5"):
+ string = ":%s:%s::%s" % (a, b, c)
+ assert pcre2.split(":", string) == ['', a, b, '', c]
+ assert pcre2.split(":+", string) == ['', a, b, c]
+ assert pcre2.split("(:+)", string) == ['', ':', a, ':', b, '::', c]
+
+ assert pcre2.split("(?::+)", ":a:b::c") == ['', 'a', 'b', 'c']
+ assert pcre2.split("([b:]+)", ":a:b::c") == ['', ':', 'a', ':b::', 'c']
+ assert pcre2.split("(?:b)|(?::+)", ":a:b::c") == ['', 'a', '', '', 'c']
+
+ assert pcre2.split(":", ":a:b::c", 2) == ['', 'a', 'b::c']
+ assert pcre2.split(":", ":a:b::c", maxsplit=2) == ['', 'a', 'b::c']
+ assert pcre2.split(':', 'a:b:c:d', maxsplit=2) == ['a', 'b', 'c:d']
+ assert pcre2.split("(:)", ":a:b::c", maxsplit=2) == ['', ':', 'a', ':', 'b::c']
+ assert pcre2.split("(:+)", ":a:b::c", maxsplit=2) == ['', ':', 'a', ':', 'b::c']